diff --git a/.github/actions/setup-uv/action.yml b/.github/actions/setup-uv/action.yml index 619b138fb2..88a73e8481 100644 --- a/.github/actions/setup-uv/action.yml +++ b/.github/actions/setup-uv/action.yml @@ -4,8 +4,9 @@ runs: using: 'composite' steps: - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v5 with: - version: "0.5.1" + version: "0.5.17" enable-cache: true cache-dependency-glob: "**/pyproject.toml" + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml deleted file mode 100644 index 4dc8c76c1a..0000000000 --- a/.github/workflows/integration-tests.yml +++ /dev/null @@ -1,82 +0,0 @@ -name: integration - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] - workflow_dispatch: - inputs: - trainer_branch: - description: "Branch of Trainer to test" - required: false - default: "main" - coqpit_branch: - description: "Branch of Coqpit to test" - required: false - default: "main" -jobs: - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.9", "3.12"] - subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] - steps: - - uses: actions/checkout@v4 - - name: Setup uv - uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - - name: Install Espeak - if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) - run: | - sudo apt-get update - sudo apt-get install espeak espeak-ng - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - make system-deps - - name: Install custom Trainer and/or Coqpit if requested - run: | - if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then - uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} - fi - if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then - uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} - fi - - name: Integration tests - run: | - resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then - resolution=lowest-direct - fi - uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} - - name: Upload coverage data - uses: actions/upload-artifact@v4 - with: - include-hidden-files: true - name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} - path: .coverage.* - if-no-files-found: ignore - coverage: - if: always() - needs: test - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Setup uv - uses: ./.github/actions/setup-uv - - uses: actions/download-artifact@v4 - with: - pattern: coverage-data-* - merge-multiple: true - - name: Combine coverage - run: | - uv python install - uvx coverage combine - uvx coverage html --skip-covered --skip-empty - uvx coverage report --format=markdown >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 1b7f44654c..ef74c60da6 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -46,8 +46,8 @@ jobs: steps: - uses: actions/download-artifact@v4 with: - path: dist - pattern: build + path: "dist/" + name: build - run: | ls -lh dist/ - name: Publish package distributions to 
PyPI diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index d1060f6be2..03426808cc 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -9,15 +9,9 @@ on: jobs: lint: runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Lint check run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 576de150fd..fdacf0acc9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: unit +name: test on: push: @@ -17,19 +17,17 @@ on: required: false default: "main" jobs: - test: + unit: runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: [3.9, "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] subset: ["data_tests", "inference_tests", "test_aux", "test_text"] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset) run: | @@ -37,7 +35,6 @@ jobs: sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -51,7 +48,7 @@ jobs: - name: Unit tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} @@ -61,10 +58,90 @@ jobs: include-hidden-files: true name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* - if-no-files-found: ignore + integration: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.12"] + shard: [0, 1, 2, 3, 4] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Integration tests for shard ${{ matrix.shard }} + run: | + uv run pytest tests/integration --collect-only --quiet | grep "::" > integration_tests.txt + total_shards=5 + shard_tests=$(awk "NR % $total_shards == ${{ matrix.shard }}" integration_tests.txt) + resolution=highest + if [ "${{ matrix.python-version }}" == "3.10" ]; then + resolution=lowest-direct + fi + uv run --resolution=$resolution --extra languages coverage run -m pytest -x -v --durations=0 $shard_tests + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: 
coverage-data-integration-${{ matrix.shard }}-${{ matrix.python-version }} + path: .coverage.* + zoo: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + partition: ["0", "1", "2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Zoo tests + run: uv run --extra server --extra languages make test_zoo + env: + NUM_PARTITIONS: 3 + TEST_PARTITION: ${{ matrix.partition }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: coverage-data-zoo-${{ matrix.partition }} + path: .coverage.* coverage: if: always() - needs: test + needs: [unit, integration, zoo] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92f6f3ab3c..2f070ad085 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,16 +2,14 @@ repos: - repo: "https://github.com/pre-commit/pre-commit-hooks" rev: v5.0.0 hooks: + - id: check-json + files: "TTS/.models.json" - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: "https://github.com/psf/black" - rev: 24.2.0 - hooks: - - id: black - language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.9.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d4a8cf0090..5fe9421442 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,30 +11,25 @@ You can contribute not only with code but with bug reports, comments, questions, If you like to contribute code, squash a bug but if you don't know where to start, here are some pointers. -- [Development Road Map](https://github.com/coqui-ai/TTS/issues/378) - - You can pick something out of our road map. We keep the progess of the project in this simple issue thread. It has new model proposals or developmental updates etc. - - [Github Issues Tracker](https://github.com/idiap/coqui-ai-TTS/issues) This is a place to find feature requests, bugs. - Issues with the ```good first issue``` tag are good place for beginners to take on. - -- ✨**PR**✨ [pages](https://github.com/idiap/coqui-ai-TTS/pulls) with the ```🚀new version``` tag. - - We list all the target improvements for the next version. You can pick one of them and start contributing. + Issues with the ```good first issue``` tag are good place for beginners to + take on. Issues tagged with `help wanted` are suited for more experienced + outside contributors. - Also feel free to suggest new features, ideas and models. We're always open for new things. -## Call for sharing language models +## Call for sharing pretrained models If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). 
We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified. This model can be shared in two ways: 1. Share the model files with us and we serve them with the next 🐸 TTS release. 2. Upload your models on GDrive and share the link. -Models are served under `.models.json` file and any model is available under TTS CLI or Server end points. +Models are served under `.models.json` file and any model is available under TTS +CLI and Python API end points. Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930). @@ -93,7 +88,7 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv run make test_all # run all the tests, report all the errors ``` -9. Format your code. We use ```black``` for code formatting. +9. Format your code. We use ```ruff``` for code formatting. ```bash make style @@ -135,7 +130,8 @@ curl -LsSf https://astral.sh/uv/install.sh | sh 13. Let's discuss until it is perfect. 💪 - We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/idiap/coqui-ai-TTS/pulls]. + We might ask you for certain changes that would appear in the + [Github ✨**PR**✨'s page](https://github.com/idiap/coqui-ai-TTS/pulls). 14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. @@ -143,9 +139,9 @@ curl -LsSf https://astral.sh/uv/install.sh | sh If you prefer working within a Docker container as your development environment, you can do the following: -1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page. +1. Fork the 🐸TTS [Github repository](https://github.com/idiap/coqui-ai-TTS) by clicking the fork button at the top right corner of the page. -2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. +2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```. ```bash git clone git@github.com:/coqui-ai-TTS.git diff --git a/Makefile b/Makefile index 1d6867f5e8..da714e7b34 100644 --- a/Makefile +++ b/Makefile @@ -6,62 +6,46 @@ help: target_dirs := tests TTS notebooks recipes -test_all: ## run tests and don't stop on an error. - nose2 --with-coverage --coverage TTS tests - ./run_bash_tests.sh - test: ## run tests. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --durations=0 tests test_vocoder: ## run vocoder tests. - coverage run -m nose2 -F -v -B tests.vocoder_tests + coverage run -m pytest -x -v --durations=0 tests/vocoder_tests test_tts: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests - -test_tts2: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests2 - -test_xtts: - coverage run -m nose2 -F -v -B tests.xtts_tests + coverage run -m pytest -x -v --durations=0 tests/tts_tests test_aux: ## run aux tests. - coverage run -m nose2 -F -v -B tests.aux_tests - ./run_bash_tests.sh + coverage run -m pytest -x -v --durations=0 tests/aux_tests -test_zoo0: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ - tests.zoo_tests.test_models.test_voice_conversion -test_zoo1: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 -test_zoo2: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 +test_zoo: ## run zoo tests. 
+ coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_models.py + +test_zoo_big: ## run tests for models that are too big for CI. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_big_models.py inference_tests: ## run inference tests. - coverage run -m nose2 -F -v -B tests.inference_tests + coverage run -m pytest -x -v --durations=0 tests/inference_tests data_tests: ## run data tests. - coverage run -m nose2 -F -v -B tests.data_tests + coverage run -m pytest -x -v --durations=0 tests/data_tests test_text: ## run text tests. - coverage run -m nose2 -F -v -B tests.text_tests + coverage run -m pytest -x -v --durations=0 tests/text_tests test_failed: ## only run tests failed the last time. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --last-failed tests style: ## update code style. - uv run --only-dev black ${target_dirs} + uv run --only-dev ruff format ${target_dirs} lint: ## run linters. uv run --only-dev ruff check ${target_dirs} - uv run --only-dev black ${target_dirs} --check + uv run --only-dev ruff format ${target_dirs} --check system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev -build-docs: ## build the docs - cd docs && make clean && make build - install: ## install 🐸 TTS uv sync --all-extras @@ -70,4 +54,4 @@ install_dev: ## install 🐸 TTS for development. uv run pre-commit install docs: ## build the docs - $(MAKE) -C docs clean && $(MAKE) -C docs html + uv run --group docs $(MAKE) -C docs clean && uv run --group docs $(MAKE) -C docs html diff --git a/README.md b/README.md index 5ca825b6ba..db8868b26d 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,34 @@ +# -## 🐸Coqui TTS News -- 📣 Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts) -- 📣 Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. -- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board. -- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). -- 📣 ⓍTTS can now stream with <200ms latency. -- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html) -- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html) -- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. -## - - -**🐸TTS is a library for advanced Text-to-Speech generation.** +**🐸 Coqui TTS is a library for advanced Text-to-Speech generation.** 🚀 Pretrained models in +1100 languages. 🛠️ Tools for training new models and fine-tuning existing models in any language. 📚 Utilities for dataset analysis and curation. 
-______________________________________________________________________ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/coqui-tts)](https://pypi.org/project/coqui-tts/) [![License]()](https://opensource.org/licenses/MPL-2.0) -[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://badge.fury.io/py/coqui-tts) +[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://pypi.org/project/coqui-tts/) [![Downloads](https://pepy.tech/badge/coqui-tts)](https://pepy.tech/project/coqui-tts) [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440) - -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg) -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg) -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml) [![Docs]()](https://coqui-tts.readthedocs.io/en/latest/) -______________________________________________________________________ +## 📣 News +- **Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts)** +- 0.25.0: [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion. +- 0.24.2: Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. +- 0.20.0: XTTSv2 is here with 17 languages and better performance across the board. XTTS can stream with <200ms latency. +- 0.19.0: XTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). +- 0.14.1: You can use [Fairseq models in ~1100 languages](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. ## 💬 Where to ask questions Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it. @@ -64,70 +58,68 @@ repository are also still a useful source of information. | 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)| ## Features -- High-performance Deep Learning models for Text2Speech tasks. See lists of models below. -- Fast and efficient model training. -- Detailed training logs on the terminal and Tensorboard. -- Support for Multi-speaker TTS. -- Efficient, flexible, lightweight but feature complete `Trainer API`. +- High-performance text-to-speech and voice conversion models, see list below. +- Fast and efficient model training with detailed training logs on the terminal and Tensorboard. +- Support for multi-speaker and multilingual TTS. 
- Released and ready-to-use models. -- Tools to curate Text2Speech datasets under```dataset_analysis```. -- Utilities to use and test your models. +- Tools to curate TTS datasets under ```dataset_analysis/```. +- Command line and Python APIs to use and test your models. - Modular (but not too much) code base enabling easy implementation of new ideas. ## Model Implementations ### Spectrogram models -- Tacotron: [paper](https://arxiv.org/abs/1703.10135) -- Tacotron2: [paper](https://arxiv.org/abs/1712.05884) -- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129) -- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802) -- Align-TTS: [paper](https://arxiv.org/abs/2003.01950) -- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf) -- FastSpeech: [paper](https://arxiv.org/abs/1905.09263) -- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558) -- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557) -- Capacitron: [paper](https://arxiv.org/abs/1906.03402) -- OverFlow: [paper](https://arxiv.org/abs/2211.06892) -- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320) -- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612) +- [Tacotron](https://arxiv.org/abs/1703.10135), [Tacotron2](https://arxiv.org/abs/1712.05884) +- [Glow-TTS](https://arxiv.org/abs/2005.11129), [SC-GlowTTS](https://arxiv.org/abs/2104.05557) +- [Speedy-Speech](https://arxiv.org/abs/2008.03802) +- [Align-TTS](https://arxiv.org/abs/2003.01950) +- [FastPitch](https://arxiv.org/pdf/2006.06873.pdf) +- [FastSpeech](https://arxiv.org/abs/1905.09263), [FastSpeech2](https://arxiv.org/abs/2006.04558) +- [Capacitron](https://arxiv.org/abs/1906.03402) +- [OverFlow](https://arxiv.org/abs/2211.06892) +- [Neural HMM TTS](https://arxiv.org/abs/2108.13320) +- [Delightful TTS](https://arxiv.org/abs/2110.12612) ### End-to-End Models -- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts) -- VITS: [paper](https://arxiv.org/pdf/2106.06103) -- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418) -- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts) -- 🐶 Bark: [orig. 
repo](https://github.com/suno-ai/bark) - -### Attention Methods -- Guided Attention: [paper](https://arxiv.org/abs/1710.08969) -- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006) -- Graves Attention: [paper](https://arxiv.org/abs/1910.10288) -- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) -- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf) -- Alignment Network: [paper](https://arxiv.org/abs/2108.10447) - -### Speaker Encoder -- GE2E: [paper](https://arxiv.org/abs/1710.10467) -- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf) +- [XTTS](https://arxiv.org/abs/2406.04904) +- [VITS](https://arxiv.org/pdf/2106.06103) +- 🐸[YourTTS](https://arxiv.org/abs/2112.02418) +- 🐢[Tortoise](https://github.com/neonbjb/tortoise-tts) +- 🐶[Bark](https://github.com/suno-ai/bark) ### Vocoders -- MelGAN: [paper](https://arxiv.org/abs/1910.06711) -- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106) -- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480) -- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646) -- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) -- WaveGrad: [paper](https://arxiv.org/abs/2009.00713) -- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646) -- UnivNet: [paper](https://arxiv.org/abs/2106.07889) +- [MelGAN](https://arxiv.org/abs/1910.06711) +- [MultiBandMelGAN](https://arxiv.org/abs/2005.05106) +- [ParallelWaveGAN](https://arxiv.org/abs/1910.11480) +- [GAN-TTS discriminators](https://arxiv.org/abs/1909.11646) +- [WaveRNN](https://github.com/fatchord/WaveRNN/) +- [WaveGrad](https://arxiv.org/abs/2009.00713) +- [HiFiGAN](https://arxiv.org/abs/2010.05646) +- [UnivNet](https://arxiv.org/abs/2106.07889) ### Voice Conversion -- FreeVC: [paper](https://arxiv.org/abs/2210.15418) +- [FreeVC](https://arxiv.org/abs/2210.15418) +- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419) +- [OpenVoice](https://arxiv.org/abs/2312.01479) + +### Others +- Attention methods: [Guided Attention](https://arxiv.org/abs/1710.08969), + [Forward Backward Decoding](https://arxiv.org/abs/1907.09006), + [Graves Attention](https://arxiv.org/abs/1910.10288), + [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/), + [Dynamic Convolutional Attention](https://arxiv.org/pdf/1910.10288.pdf), + [Alignment Network](https://arxiv.org/abs/2108.10447) +- Speaker encoders: [GE2E](https://arxiv.org/abs/1710.10467), + [Angular Loss](https://arxiv.org/pdf/2003.11982.pdf) You can also help us implement more models. + ## Installation -🐸TTS is tested on Ubuntu 22.04 with **python >= 3.9, < 3.13.**. -If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. +🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.13**, but should also +work on Mac and Windows. + +If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option. ```bash pip install coqui-tts @@ -165,24 +157,21 @@ pip install -e .[server,ja] ### Platforms -If you are on Ubuntu (Debian), you can also run following commands for installation. +If you are on Ubuntu (Debian), you can also run the following commands for installation. 
```bash -make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. +make system-deps make install ``` -If you are on Windows, 👑@GuyPaddock wrote installation instructions -[here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system) -(note that these are out of date, e.g. you need to have at least Python 3.9). - + ## Docker Image -You can also try TTS without install with the docker image. -Simply run the following command and you will be able to run TTS without installing it. +You can also try out Coqui TTS without installation with the docker image. +Simply run the following command and you will be able to run TTS: ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server ``` @@ -193,10 +182,10 @@ More details about the docker images (like GPU support) can be found ## Synthesizing speech by 🐸TTS - + ### 🐍 Python API -#### Running a multi-speaker and multi-lingual model +#### Multi-speaker and multi-lingual model ```python import torch @@ -208,44 +197,67 @@ device = "cuda" if torch.cuda.is_available() else "cpu" # List available 🐸TTS models print(TTS().list_models()) -# Init TTS +# Initialize TTS tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) +# List speakers +print(tts.speakers) + # Run TTS -# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language -# Text to speech list of amplitude values as output -wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") -# Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") +# ❗ XTTS supports both, but many models allow only one of the `speaker` and +# `speaker_wav` arguments + +# TTS with list of amplitude values as output, clone the voice from `speaker_wav` +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) + +# TTS to a file, use a preset speaker +tts.tts_to_file( + text="Hello world!", + speaker="Craig Gutsy", + language="en", + file_path="output.wav" +) ``` -#### Running a single speaker model +#### Single speaker model ```python -# Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device) +# Initialize TTS with the target model name +tts = TTS("tts_models/de/thorsten/tacotron2-DDC").to(device) # Run TTS tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) - -# Example voice cloning with YourTTS in English, French and Portuguese -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device) -tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav") -tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav") ``` -#### Example voice conversion +#### Voice conversion (VC) -Converting the voice in `source_wav` to the voice of `target_wav` +Converting the voice in 
`source_wav` to the voice of `target_wav`: ```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") -tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) ``` -#### Example voice cloning together with the voice conversion model. -This way, you can clone voices by using any model in 🐸TTS. +Other available voice conversion models: +- `voice_conversion_models/multilingual/multi-dataset/knnvc` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +For more details, see the +[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html). + +#### Voice cloning by combining single speaker TTS model with the default VC model + +This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is +used for voice conversion after synthesizing speech. ```python @@ -257,7 +269,7 @@ tts.tts_with_vc_to_file( ) ``` -#### Example text to speech using **Fairseq models in ~1100 languages** 🤯. +#### TTS using Fairseq models in ~1100 languages 🤯 For Fairseq models, use the following name format: `tts_models//fairseq/vits`. You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). @@ -271,145 +283,126 @@ api.tts_to_file( ) ``` -### Command-line `tts` +### Command-line interface `tts` -Synthesize speech on command line. +Synthesize speech on the command line. You can either use your trained model or choose a model from the provided list. -If you don't specify any models, then it uses LJSpeech based English model. - -#### Single Speaker Models - - List provided models: + ```sh + tts --list_models ``` - $ tts --list_models - ``` - -- Get model info (for both tts_models and vocoder_models): - - - Query by type/name: - The model_info_by_name uses the name as it from the --list_models. - ``` - $ tts --model_info_by_name "///" - ``` - For example: - ``` - $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts - $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 - ``` - - Query by type/idx: - The model_query_idx uses the corresponding idx from --list_models. - ``` - $ tts --model_info_by_idx "/" - ``` - - For example: - - ``` - $ tts --model_info_by_idx tts_models/3 - ``` +- Get model information. Use the names obtained from `--list_models`. 
+ ```sh + tts --model_info_by_name "///" + ``` + For example: + ```sh + tts --model_info_by_name tts_models/tr/common-voice/glow-tts + tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` - - Query info for model info by full name: - ``` - $ tts --model_info_by_name "///" - ``` +#### Single speaker models -- Run TTS with default models: +- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`): - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run TTS and pipe out the generated TTS wav file data: - ``` - $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ```sh + tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` - Run a TTS model with its default vocoder model: - ``` - $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "///" \ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "tts_models/en/ljspeech/glow-tts" \ + --out_path output/path/speech.wav ``` -- Run with specific TTS and vocoder models from the list: +- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model. - ``` - $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "///" \ + --vocoder_name "///" \ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "tts_models/en/ljspeech/glow-tts" \ + --vocoder_name "vocoder_models/en/ljspeech/univnet" \ + --out_path output/path/speech.wav ``` -- Run your own TTS model (Using Griffin-Lim Vocoder): +- Run your own TTS model (using Griffin-Lim Vocoder): - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_path path/to/model.pth \ + --config_path path/to/config.json \ + --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ```sh + tts --text "Text for TTS" \ + --model_path path/to/model.pth \ + --config_path path/to/config.json \ + --out_path output/path/speech.wav \ + --vocoder_path path/to/vocoder.pth \ + --vocoder_config_path path/to/vocoder_config.json ``` -#### Multi-speaker Models +#### Multi-speaker models -- List the available speakers and choose a among them: +- List the available speakers and choose a `` among them: - ``` - $ tts --model_name "//" --list_speaker_idxs + ```sh + tts --model_name "//" --list_speaker_idxs ``` - Run the multi-speaker TTS model with the target speaker ID: - ``` - $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ```sh + tts --text "Text for TTS." 
--out_path output/path/speech.wav \ + --model_name "//" --speaker_idx ``` - Run your own multi-speaker TTS model: - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav \ + --model_path path/to/model.pth --config_path path/to/config.json \ + --speakers_file_path path/to/speaker.json --speaker_idx ``` -### Voice Conversion Models +#### Voice conversion models -``` -$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +```sh +tts --out_path output/path/speech.wav --model_name "//" \ + --source_wav --target_wav ``` - -## Directory Structure -``` -|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) -|- utils/ (common utilities.) -|- TTS - |- bin/ (folder for all the executables.) - |- train*.py (train your target model.) - |- ... - |- tts/ (text to speech models) - |- layers/ (model layer definitions) - |- models/ (model definitions) - |- utils/ (model specific utilities.) - |- speaker_encoder/ (Speaker Encoder models.) - |- (same) - |- vocoder/ (Vocoder models.) - |- (same) -``` diff --git a/TTS/.models.json b/TTS/.models.json index 1a12e8c8a3..4cc3344167 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -803,6 +803,22 @@ "license": "apache 2.0" } }, + "librispeech100": { + "wavlm-hifigan": { + "description": "HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + }, + "wavlm-hifigan_prematched": { + "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + } + }, "ljspeech": { "multiband-melgan": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", @@ -943,10 +959,42 @@ "freevc24": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", + "default_vocoder": null, "author": "Jing-Yi Li @OlaWod", "license": "MIT", "commit": null } + }, + "multi-dataset": { + "knnvc": { + "description": "kNN-VC model from https://github.com/bshall/knn-vc", + "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT", + "commit": null + }, + "openvoice_v1": { + "hf_url": [ + "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json", + "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth" + ], + "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, + "author": "MyShell.ai", + "license": "MIT", + "commit": null + }, + "openvoice_v2": { + "hf_url": [ + 
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/config.json", + "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth" + ], + "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, + "author": "MyShell.ai", + "license": "MIT", + "commit": null + } } } } diff --git a/TTS/__init__.py b/TTS/__init__.py index 8e93c9b5db..d270e09e22 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -4,6 +4,15 @@ __version__ = importlib.metadata.version("coqui-tts") +if "coqpit" in importlib.metadata.packages_distributions().get("coqpit", []): + msg = ( + "coqui-tts switched to a forked version of Coqpit, but you still have the original " + "package installed. Run the following to avoid conflicts:\n" + " pip uninstall coqpit\n" + " pip install coqpit-config" + ) + raise ImportError(msg) + if is_pytorch_at_least_2_4(): import _codecs diff --git a/TTS/api.py b/TTS/api.py index 250ed1a0d9..3db1e25b11 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -1,3 +1,5 @@ +"""Coqui TTS Python API.""" + import logging import tempfile import warnings @@ -6,7 +8,6 @@ from torch import nn from TTS.config import load_config -from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer @@ -19,13 +20,19 @@ class TTS(nn.Module): def __init__( self, model_name: str = "", - model_path: str = None, - config_path: str = None, - vocoder_path: str = None, - vocoder_config_path: str = None, + *, + model_path: str | None = None, + config_path: str | None = None, + vocoder_name: str | None = None, + vocoder_path: str | None = None, + vocoder_config_path: str | None = None, + encoder_path: str | None = None, + encoder_config_path: str | None = None, + speakers_file_path: str | None = None, + language_ids_file_path: str | None = None, progress_bar: bool = True, - gpu=False, - ): + gpu: bool = False, + ) -> None: """🐸TTS python interface that allows to load and use the released models. 
Example with a multi-speaker model: @@ -35,66 +42,82 @@ def __init__( >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") Example with a single-speaker model: - >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) + >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") Example loading a model from a path: - >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) + >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") Example voice cloning with YourTTS in English, French and Portuguese: - >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) + >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda") >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html): - >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True) + >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False).to("cuda") >>> tts.tts_to_file("This is a test.", file_path="output.wav") Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. model_path (str, optional): Path to the model checkpoint. Defaults to None. config_path (str, optional): Path to the model config. Defaults to None. + vocoder_name (str, optional): Pre-trained vocoder to use. Defaults to None, i.e. using the default vocoder. vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. - progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. - gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + encoder_path: Path to speaker encoder checkpoint. Default to None. + encoder_config_path: Path to speaker encoder config file. Defaults to None. + speakers_file_path: JSON file for multi-speaker model. Defaults to None. + language_ids_file_path: JSON file for multilingual model. Defaults to None + progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True. + gpu (bool, optional): Enable/disable GPU. Defaults to False. 
DEPRECATED, use TTS(...).to("cuda") """ super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = load_config(config_path) if config_path else None - self.synthesizer = None - self.voice_converter = None + self.synthesizer: Synthesizer | None = None + self.voice_converter: Synthesizer | None = None self.model_name = "" + + self.vocoder_path = vocoder_path + self.vocoder_config_path = vocoder_config_path + self.encoder_path = encoder_path + self.encoder_config_path = encoder_config_path + self.speakers_file_path = speakers_file_path + self.language_ids_file_path = language_ids_file_path + if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") if model_name is not None and len(model_name) > 0: if "tts_models" in model_name: - self.load_tts_model_by_name(model_name, gpu) + self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) elif "voice_conversion_models" in model_name: - self.load_vc_model_by_name(model_name, gpu) + self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu) + # To allow just TTS("xtts") else: - self.load_model_by_name(model_name, gpu) + self.load_model_by_name(model_name, vocoder_name, gpu=gpu) if model_path: - self.load_tts_model_by_path( - model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu - ) + self.load_tts_model_by_path(model_path, config_path, gpu=gpu) @property - def models(self): + def models(self) -> list[str]: return self.manager.list_tts_models() @property - def is_multi_speaker(self): - if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager: + def is_multi_speaker(self) -> bool: + if ( + self.synthesizer is not None + and hasattr(self.synthesizer.tts_model, "speaker_manager") + and self.synthesizer.tts_model.speaker_manager + ): return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 return False @property - def is_multi_lingual(self): + def is_multi_lingual(self) -> bool: # Not sure what sets this to None, but applied a fix to prevent crashing. 
if ( isinstance(self.model_name, str) @@ -103,31 +126,37 @@ def is_multi_lingual(self): and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1) ): return True - if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: + if ( + self.synthesizer is not None + and hasattr(self.synthesizer.tts_model, "language_manager") + and self.synthesizer.tts_model.language_manager + ): return self.synthesizer.tts_model.language_manager.num_languages > 1 return False @property - def speakers(self): + def speakers(self) -> list[str]: if not self.is_multi_speaker: return None return self.synthesizer.tts_model.speaker_manager.speaker_names @property - def languages(self): + def languages(self) -> list[str]: if not self.is_multi_lingual: return None return self.synthesizer.tts_model.language_manager.language_names @staticmethod - def get_models_file_path(): + def get_models_file_path() -> Path: return Path(__file__).parent / ".models.json" @staticmethod - def list_models(): + def list_models() -> list[str]: return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() - def download_model_by_name(self, model_name: str): + def download_model_by_name( + self, model_name: str, vocoder_name: str | None = None + ) -> tuple[Path | None, Path | None, Path | None, Path | None, Path | None]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files @@ -135,19 +164,27 @@ def download_model_by_name(self, model_name: str): return None, None, None, None, model_path if model_item.get("default_vocoder") is None: return model_path, config_path, None, None, None - vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"]) + if vocoder_name is None: + vocoder_name = model_item["default_vocoder"] + vocoder_path, vocoder_config_path = None, None + # A local vocoder model will take precedence if already specified in __init__ + if model_item["model_type"] == "tts_models": + vocoder_path = self.vocoder_path + vocoder_config_path = self.vocoder_config_path + if vocoder_path is None or vocoder_config_path is None: + vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) return model_path, config_path, vocoder_path, vocoder_config_path, None - def load_model_by_name(self, model_name: str, gpu: bool = False): + def load_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. Args: model_name (str): Model name to load. You can list models by ```tts.models```. gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ - self.load_tts_model_by_name(model_name, gpu) + self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, gpu: bool = False): + def load_vc_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. Args: @@ -155,10 +192,19 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False): gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. 
""" self.model_name = model_name - model_path, config_path, _, _, _ = self.download_model_by_name(model_name) - self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) + self.voice_converter = Synthesizer( + vc_checkpoint=model_path, + vc_config=config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + model_dir=model_dir, + use_cuda=gpu, + ) - def load_tts_model_by_name(self, model_name: str, gpu: bool = False): + def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of 🐸TTS models by name. Args: @@ -170,7 +216,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) # init synthesizer # None values are fetch from the model @@ -181,15 +229,13 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): tts_languages_file=None, vocoder_checkpoint=vocoder_path, vocoder_config=vocoder_config_path, - encoder_checkpoint=None, - encoder_config=None, + encoder_checkpoint=self.encoder_path, + encoder_config=self.encoder_config_path, model_dir=model_dir, use_cuda=gpu, ) - def load_tts_model_by_path( - self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False - ): + def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool = False) -> None: """Load a model from a path. Args: @@ -203,22 +249,21 @@ def load_tts_model_by_path( self.synthesizer = Synthesizer( tts_checkpoint=model_path, tts_config_path=config_path, - tts_speakers_file=None, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config, - encoder_checkpoint=None, - encoder_config=None, + tts_speakers_file=self.speakers_file_path, + tts_languages_file=self.language_ids_file_path, + vocoder_checkpoint=self.vocoder_path, + vocoder_config=self.vocoder_config_path, + encoder_checkpoint=self.encoder_path, + encoder_config=self.encoder_config_path, use_cuda=gpu, ) def _check_arguments( self, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, **kwargs, ) -> None: """Check if the arguments are valid for the model.""" @@ -231,17 +276,16 @@ def _check_arguments( raise ValueError("Model is not multi-speaker but `speaker` is provided.") if not self.is_multi_lingual and language is not None: raise ValueError("Model is not multi-lingual but `language` is provided.") - if emotion is not None and speed is not None: - raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.") + if emotion is not None: + raise ValueError("Emotion can only be used with Coqui Studio models. 
Which is discontinued.") def tts( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, split_sentences: bool = True, **kwargs, ): @@ -260,9 +304,6 @@ def tts( Defaults to None. emotion (str, optional): Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None. - speed (float, optional): - Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0. - Defaults to None. split_sentences (bool, optional): Split text into sentences, synthesize them separately and concatenate the file audio. Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only @@ -270,18 +311,12 @@ def tts( kwargs (dict, optional): Additional arguments for the model. """ - self._check_arguments( - speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs - ) + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, **kwargs) wav = self.synthesizer.tts( text=text, speaker_name=speaker, language_name=language, speaker_wav=speaker_wav, - reference_wav=None, - style_wav=None, - style_text=None, - reference_speaker_name=None, split_sentences=split_sentences, **kwargs, ) @@ -290,16 +325,15 @@ def tts( def tts_to_file( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = 1.0, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, pipe_out=None, file_path: str = "output.wav", split_sentences: bool = True, **kwargs, - ): + ) -> str: """Convert text to speech. Args: @@ -316,8 +350,6 @@ def tts_to_file( Defaults to None. emotion (str, optional): Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral". - speed (float, optional): - Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. file_path (str, optional): @@ -345,7 +377,7 @@ def tts_to_file( def voice_conversion( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], ): """Voice conversion with FreeVC. Convert source wav to target speaker. @@ -355,15 +387,18 @@ def voice_conversion( target_wav (str):` Path to the target wav file. """ - wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) - return wav + if self.voice_converter is None: + msg = "The selected model does not support voice conversion." + raise RuntimeError(msg) + return self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) def voice_conversion_to_file( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], file_path: str = "output.wav", - ): + pipe_out=None, + ) -> str: """Voice conversion with FreeVC. Convert source wav to target speaker. Args: @@ -373,17 +408,20 @@ def voice_conversion_to_file( Path to the target wav file. file_path (str, optional): Output file path. Defaults to "output.wav". + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. 
""" wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav) - save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) return file_path def tts_with_vc( self, text: str, - language: str = None, - speaker_wav: str = None, - speaker: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], + speaker: str | None = None, split_sentences: bool = True, ): """Convert text to speech with voice conversion. @@ -423,12 +461,14 @@ def tts_with_vc( def tts_with_vc_to_file( self, text: str, - language: str = None, - speaker_wav: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], file_path: str = "output.wav", - speaker: str = None, + speaker: str | None = None, split_sentences: bool = True, - ): + pipe_out=None, + ) -> str: """Convert text to speech with voice conversion and save to file. Check `tts_with_vc` for more details. @@ -451,8 +491,11 @@ def tts_with_vc_to_file( Split text into sentences, synthesize them separately and concatenate the file audio. Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only applicable to the 🐸TTS models. Defaults to True. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. """ wav = self.tts_with_vc( text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences ) - save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) + return file_path diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 127199186b..8d7a2633a0 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -2,6 +2,7 @@ import importlib import logging import os +import sys from argparse import RawTextHelpFormatter import numpy as np @@ -18,7 +19,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # pylint: disable=bad-option-value parser = argparse.ArgumentParser( @@ -80,7 +81,7 @@ num_chars = len(phonemes) if C.use_phonemes else len(symbols) # TODO: handle multi-speaker model = setup_model(C) - model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True) + model, _ = load_checkpoint(model, args.model_path, use_cuda=args.use_cuda, eval=True) # data loader preprocessor = importlib.import_module("TTS.tts.datasets.formatters") @@ -112,7 +113,7 @@ # compute attentions file_paths = [] - with torch.no_grad(): + with torch.inference_mode(): for data in tqdm(loader): # setup input data text_input = data[0] diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 1bdb8d733c..d450e26fba 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -1,6 +1,7 @@ import argparse import logging import os +import sys from argparse import RawTextHelpFormatter import torch @@ -14,6 +15,88 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each audio file in a dataset and store them 
keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" + """ + Example runs: + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json + + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "--model_path", + type=str, + help="Path to model checkpoint file. It defaults to the released speaker encoder.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to model config file. It defaults to the released speaker encoder config.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", + ) + parser.add_argument( + "--config_dataset_path", + type=str, + help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", + default=None, + ) + parser.add_argument( + "--output_path", + type=str, + help="Path for output `pth` or `json` file.", + default="speakers.pth", + ) + parser.add_argument( + "--old_file", + type=str, + help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", + default=None, + ) + parser.add_argument( + "--old_append", + help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", + default=False, + action="store_true", + ) + parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) + parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") + parser.add_argument( + "--formatter_name", + type=str, + help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_path", + type=str, + help="Path to the dataset. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_train", + type=str, + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. 
You either need to provide this or `config_dataset_path`", + default=None, + ) + return parser.parse_args() + + def compute_embeddings( model_path, config_path, @@ -101,88 +184,9 @@ def compute_embeddings( print("Speaker embeddings saved at:", mapping_file_path) -if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser( - description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" - """ - Example runs: - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json - - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv - """, - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--model_path", - type=str, - help="Path to model checkpoint file. It defaults to the released speaker encoder.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to model config file. It defaults to the released speaker encoder config.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", - ) - parser.add_argument( - "--config_dataset_path", - type=str, - help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", - default=None, - ) - parser.add_argument( - "--output_path", - type=str, - help="Path for output `pth` or `json` file.", - default="speakers.pth", - ) - parser.add_argument( - "--old_file", - type=str, - help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", - default=None, - ) - parser.add_argument( - "--old_append", - help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", - default=False, - action="store_true", - ) - parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) - parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") - parser.add_argument( - "--formatter_name", - type=str, - help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_name", - type=str, - help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_path", - type=str, - help="Path to the dataset. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_train", - type=str, - help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_val", - type=str, - help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. 
You either need to provide this or `config_dataset_path`", - default=None, - ) - args = parser.parse_args() +def main(arg_list: list[str] | None = None): + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) compute_embeddings( args.model_path, @@ -199,3 +203,7 @@ def compute_embeddings( disable_cuda=args.disable_cuda, no_eval=args.no_eval, ) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index dc5423a691..1da7a092fb 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import argparse import glob import logging import os +import sys import numpy as np from tqdm import tqdm @@ -16,10 +16,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def main(): - """Run preprocessing process.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - +def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -29,7 +26,13 @@ def main(): required=False, help="folder including the target set of wavs overriding dataset config.", ) - args, overrides = parser.parse_known_args() + return parser.parse_known_args(arg_list) + + +def main(arg_list: list[str] | None = None): + """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) + args, overrides = parse_args(arg_list) CONFIG = load_config(args.config_path) CONFIG.parse_known_args(overrides, relaxed_parser=True) @@ -94,6 +97,7 @@ def main(): stats["audio_config"] = CONFIG.audio.to_dict() np.save(output_file_path, stats, allow_pickle=True) print(f" > stats saved to {output_file_path}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index 711c8221db..701c7d8e82 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,5 +1,6 @@ import argparse import logging +import sys from argparse import RawTextHelpFormatter import torch @@ -53,7 +54,7 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser( description="""Compute the accuracy of the encoder.\n\n""" diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 86a4dce177..be9387f015 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -3,7 +3,8 @@ import argparse import logging -import os +import sys +from pathlib import Path import numpy as np import torch @@ -12,8 +13,10 @@ from trainer.generic_utils import count_parameters from TTS.config import load_config +from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from 
TTS.utils.audio import AudioProcessor @@ -23,56 +26,66 @@ use_cuda = torch.cuda.is_available() -def setup_loader(ap, r): - tokenizer, _ = TTSTokenizer.init_from_config(c) +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) + return parser.parse_args(arg_list) + + +def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader: + tokenizer, _ = TTSTokenizer.init_from_config(config) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=False, - samples=meta_data, + samples=samples, tokenizer=tokenizer, ap=ap, batch_group_size=0, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - phoneme_cache_path=c.phoneme_cache_path, + min_text_len=config.min_text_len, + max_text_len=config.max_text_len, + min_audio_len=config.min_audio_len, + max_audio_len=config.max_audio_len, + phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None, ) - if c.use_phonemes and c.compute_input_seq_cache: + if config.use_phonemes and config.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
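With `parse_args()` split out and `main()` taking an explicit argument list, the spectrogram-extraction script can be driven programmatically instead of via `sys.argv`. A minimal sketch, assuming the module imports as `TTS.bin.extract_tts_spectrograms`; all paths are placeholders:

```python
# Minimal sketch with placeholder paths. main() parses the list itself and
# finishes with sys.exit(0), so catch SystemExit when embedding it in a test.
from TTS.bin.extract_tts_spectrograms import main

try:
    main([
        "--config_path", "path/to/config.json",
        "--checkpoint_path", "path/to/checkpoint.pth",
        "--output_path", "output/specs",
        "--no-eval",  # BooleanOptionalAction counterpart of --eval
    ])
except SystemExit as exc:
    assert exc.code == 0
```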
- dataset.compute_input_seq(c.num_loader_workers) + dataset.compute_input_seq(config.num_loader_workers) dataset.preprocess_samples() - loader = DataLoader( + return DataLoader( dataset, - batch_size=c.batch_size, + batch_size=config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=None, - num_workers=c.num_loader_workers, + num_workers=config.num_loader_workers, pin_memory=False, ) - return loader -def set_filename(wav_path, out_path): - wav_file = os.path.basename(wav_path) - file_name = wav_file.split(".")[0] - os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) - os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) - wavq_path = os.path.join(out_path, "quant", file_name) - mel_path = os.path.join(out_path, "mel", file_name) - wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") - wav_path = os.path.join(out_path, "wav", file_name + ".wav") - return file_name, wavq_path, mel_path, wav_gl_path, wav_path +def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]: + wav_name = Path(wav_path).stem + (out_path / "quant").mkdir(exist_ok=True, parents=True) + (out_path / "mel").mkdir(exist_ok=True, parents=True) + (out_path / "wav_gl").mkdir(exist_ok=True, parents=True) + (out_path / "wav").mkdir(exist_ok=True, parents=True) + wavq_path = out_path / "quant" / wav_name + mel_path = out_path / "mel" / wav_name + wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav" + out_wav_path = out_path / "wav" / f"{wav_name}.wav" + return wavq_path, mel_path, wav_gl_path, out_wav_path def format_data(data): @@ -114,18 +127,18 @@ def format_data(data): ) -@torch.no_grad() +@torch.inference_mode() def inference( - model_name, - model, - ap, + model_name: str, + model: BaseTTS, + ap: AudioProcessor, text_input, text_lengths, mel_input, mel_lengths, speaker_ids=None, d_vectors=None, -): +) -> np.ndarray: if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: @@ -140,9 +153,9 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.detach().cpu().numpy() + return model_output.detach().cpu().numpy() - elif "tacotron" in model_name: + if "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] @@ -153,16 +166,24 @@ def inference( for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) - model_output = torch.stack(mel_specs).cpu().numpy() - - elif model_name == "tacotron2": - model_output = postnet_outputs.detach().cpu().numpy() - return model_output + return torch.stack(mel_specs).cpu().numpy() + if model_name == "tacotron2": + return postnet_outputs.detach().cpu().numpy() + msg = f"Model not supported: {model_name}" + raise ValueError(msg) def extract_spectrograms( - data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" -): + model_name: str, + data_loader: DataLoader, + model: BaseTTS, + ap: AudioProcessor, + output_path: Path, + quantize_bits: int = 0, + save_audio: bool = False, + debug: bool = False, + metadata_name: str = "metadata.txt", +) -> None: model.eval() export_metadata = [] for 
_, data in tqdm(enumerate(data_loader), total=len(data_loader)): @@ -181,7 +202,7 @@ def extract_spectrograms( ) = format_data(data) model_output = inference( - c.model.lower(), + model_name, model, ap, text_input, @@ -195,7 +216,7 @@ def extract_spectrograms( for idx in range(text_input.shape[0]): wav_file_path = item_idx[idx] wav = ap.load_wav(wav_file_path) - _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) # quantize and save wav if quantize_bits > 0: @@ -217,74 +238,67 @@ def extract_spectrograms( wav = ap.inv_melspectrogram(mel) ap.save_wav(wav, wav_gl_path) - with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + with (output_path / metadata_name).open("w") as f: for data in export_metadata: - f.write(f"{data[0]}|{data[1]+'.npy'}\n") + f.write(f"{data[0] / data[1]}.npy\n") -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data, speaker_manager +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) + config = load_config(args.config_path) + config.audio.trim_silence = False # Audio processor - ap = AudioProcessor(**c.audio) + ap = AudioProcessor(**config.audio) # load data instances meta_data_train, meta_data_eval = load_tts_samples( - c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=args.eval, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # init speaker manager - if c.use_speaker_embedding: + if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif c.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) else: speaker_manager = None # setup model - model = setup_model(c) + model = setup_model(config) # restore model - model.load_checkpoint(c, args.checkpoint_path, eval=True) + model.load_checkpoint(config, args.checkpoint_path, eval=True) if use_cuda: model.cuda() num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) # set r - r = 1 if c.model.lower() == "glow_tts" else model.decoder.r - own_loader = setup_loader(ap, r) + r = 1 if config.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(config, ap, r, speaker_manager, meta_data) extract_spectrograms( + config.model.lower(), own_loader, model, ap, - args.output_path, + Path(args.output_path), quantize_bits=args.quantize_bits, save_audio=args.save_audio, debug=args.debug, - metada_name="metada.txt", + metadata_name="metadata.txt", ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser() - parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) - parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) - parser.add_argument("--output_path", type=str, help="Path to 
save mel specs", required=True) - parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") - parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") - parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) - args = parser.parse_args() - - c = load_config(args.config_path) - c.audio.trim_silence = False - main(args) + main() diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 0519d43769..7a7fdf5dd4 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -2,6 +2,7 @@ import argparse import logging +import sys from argparse import RawTextHelpFormatter from TTS.config import load_config @@ -10,7 +11,7 @@ def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # pylint: disable=bad-option-value parser = argparse.ArgumentParser( diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index d99acb9893..40afa1456c 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,8 +1,9 @@ -"""Find all the unique characters in a dataset""" +"""Find all the unique characters in a dataset.""" import argparse import logging import multiprocessing +import sys from argparse import RawTextHelpFormatter from tqdm.contrib.concurrent import process_map @@ -13,18 +14,13 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def compute_phonemes(item): +def compute_phonemes(item: dict) -> set[str]: text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") return set(ph) -def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - # pylint: disable=W0601 - global c, phonemizer - # pylint: disable=bad-option-value +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -35,13 +31,21 @@ def main(): formatter_class=RawTextHelpFormatter, ) parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) - args = parser.parse_args() + return parser.parse_args(arg_list) + - c = load_config(args.config_path) +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + global phonemizer + args = parse_args(arg_list) + config = load_config(args.config_path) # load all datasets train_items, eval_items = load_tts_samples( - c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) items = train_items + eval_items print("Num items:", len(items)) @@ -49,13 +53,16 @@ def main(): language_list = [item["language"] for item in items] is_lang_def = all(language_list) - if not c.phoneme_language or not is_lang_def: - raise ValueError("Phoneme language must be defined in config.") + if not config.phoneme_language or not is_lang_def: + msg = "Phoneme language must be defined in config." 
+ raise ValueError(msg) - if not language_list.count(language_list[0]) == len(language_list): - raise ValueError( - "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" + if language_list.count(language_list[0]) != len(language_list): + msg = ( + "Currently, just one phoneme language per config file is supported !! " + "Please split the dataset config into different configs and run it individually for each language !!" ) + raise ValueError(msg) phonemizer = Gruut(language=language_list[0], keep_puncs=True) @@ -73,6 +80,7 @@ def main(): print(f" > Unique phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index edab882db8..f9121d7f77 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -4,6 +4,7 @@ import multiprocessing import os import pathlib +import sys import torch from tqdm import tqdm @@ -77,7 +78,7 @@ def preprocess_audios(): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser( description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end" diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 20e429df04..00d7530427 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -9,135 +9,132 @@ from argparse import RawTextHelpFormatter # pylint: disable=redefined-outer-name, unused-argument -from pathlib import Path - from TTS.utils.generic_utils import ConsoleFormatter, setup_logger logger = logging.getLogger(__name__) description = """ -Synthesize speech on command line. +Synthesize speech on the command line. You can either use your trained model or choose a model from the provided list. -If you don't specify any models, then it uses LJSpeech based English model. - -#### Single Speaker Models - - List provided models: + ```sh + tts --list_models ``` - $ tts --list_models - ``` - -- Get model info (for both tts_models and vocoder_models): - - - Query by type/name: - The model_info_by_name uses the name as it from the --list_models. - ``` - $ tts --model_info_by_name "///" - ``` - For example: - ``` - $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts - $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 - ``` - - Query by type/idx: - The model_query_idx uses the corresponding idx from --list_models. - - ``` - $ tts --model_info_by_idx "/" - ``` - - For example: - ``` - $ tts --model_info_by_idx tts_models/3 - ``` +- Get model information. Use the names obtained from `--list_models`. 
+ ```sh + tts --model_info_by_name "///" + ``` + For example: + ```sh + tts --model_info_by_name tts_models/tr/common-voice/glow-tts + tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` - - Query info for model info by full name: - ``` - $ tts --model_info_by_name "///" - ``` +#### Single speaker models -- Run TTS with default models: +- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`): - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run TTS and pipe out the generated TTS wav file data: - ``` - $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ```sh + tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` - Run a TTS model with its default vocoder model: - ``` - $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "///" \\ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "tts_models/en/ljspeech/glow-tts" \\ + --out_path output/path/speech.wav ``` -- Run with specific TTS and vocoder models from the list: +- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model. - ``` - $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "///" \\ + --vocoder_name "///" \\ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "tts_models/en/ljspeech/glow-tts" \\ + --vocoder_name "vocoder_models/en/ljspeech/univnet" \\ + --out_path output/path/speech.wav ``` -- Run your own TTS model (Using Griffin-Lim Vocoder): +- Run your own TTS model (using Griffin-Lim Vocoder): - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_path path/to/model.pth \\ + --config_path path/to/config.json \\ + --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ```sh + tts --text "Text for TTS" \\ + --model_path path/to/model.pth \\ + --config_path path/to/config.json \\ + --out_path output/path/speech.wav \\ + --vocoder_path path/to/vocoder.pth \\ + --vocoder_config_path path/to/vocoder_config.json ``` -#### Multi-speaker Models +#### Multi-speaker models -- List the available speakers and choose a among them: +- List the available speakers and choose a `` among them: - ``` - $ tts --model_name "//" --list_speaker_idxs + ```sh + tts --model_name "//" --list_speaker_idxs ``` - Run the multi-speaker TTS model with the target speaker ID: - ``` - $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ```sh + tts --text "Text for TTS." 
--out_path output/path/speech.wav \\ + --model_name "//" --speaker_idx ``` - Run your own multi-speaker TTS model: - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav \\ + --model_path path/to/model.pth --config_path path/to/config.json \\ + --speakers_file_path path/to/speaker.json --speaker_idx ``` -### Voice Conversion Models +#### Voice conversion models -``` -$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +```sh +tts --out_path output/path/speech.wav --model_name "//" \\ + --source_wav --target_wav ``` """ -def parse_args() -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -253,11 +250,6 @@ def parse_args() -> argparse.Namespace: action="store_true", ) # aux args - parser.add_argument( - "--save_spectogram", - action="store_true", - help="Save raw spectogram for further (vocoder) processing in out_path.", - ) parser.add_argument( "--reference_wav", type=str, @@ -282,13 +274,14 @@ def parse_args() -> argparse.Namespace: "--source_wav", type=str, default=None, - help="Original audio file to convert in the voice of the target_wav", + help="Original audio file to convert into the voice of the target_wav", ) parser.add_argument( "--target_wav", type=str, + nargs="*", default=None, - help="Target audio file to convert in the voice of the source_wav", + help="Audio file(s) of the target voice into which to convert the source_wav", ) parser.add_argument( @@ -298,7 +291,7 @@ def parse_args() -> argparse.Namespace: help="Voice dir for tortoise model", ) - args = parser.parse_args() + args = parser.parse_args(arg_list) # print the description if either text or list_models is not set check_args = [ @@ -317,20 +310,21 @@ def parse_args() -> argparse.Namespace: return args -def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - args = parse_args() +def main(arg_list: list[str] | None = None) -> None: + """Entry point for `tts` command line interface.""" + args = parse_args(arg_list) + stream = sys.stderr if args.pipe_out else sys.stdout + setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter()) pipe_out = sys.stdout if args.pipe_out else None with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): # Late-import to make things load faster + from TTS.api import TTS from TTS.utils.manage import ModelManager - from TTS.utils.synthesizer import Synthesizer # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path, progress_bar=args.progress_bar) + manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=args.progress_bar) tts_path = None tts_config_path = None @@ -344,142 +338,100 @@ def main(): vc_config_path = None model_dir = None - # CASE1 #list : list pre-trained TTS models + # 1) List pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() + sys.exit(0) - # CASE2 #info : model info for pre-trained TTS models + # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) - sys.exit() + sys.exit(0) if args.model_info_by_name: 
model_query_full_name = args.model_info_by_name manager.model_info_by_full_name(model_query_full_name) - sys.exit() - - # CASE3: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - # tts model - if model_item["model_type"] == "tts_models": - tts_path = model_path - tts_config_path = config_path - if args.vocoder_name is None and "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] - - # voice conversion model - if model_item["model_type"] == "voice_conversion_models": - vc_path = model_path - vc_config_path = config_path - - # tts model with multiple files to be loaded from the directory path - if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): - model_dir = model_path - tts_path = None - tts_config_path = None - args.vocoder_name = None - - # load vocoder - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - - # CASE4: set custom model paths - if args.model_path is not None: - tts_path = args.model_path - tts_config_path = args.config_path - speakers_file_path = args.speakers_file_path - language_ids_file_path = args.language_ids_file_path - - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path + sys.exit(0) + # 3) Load a model for further info or TTS/VC device = args.device if args.use_cuda: device = "cuda" - - # load models - synthesizer = Synthesizer( - tts_path, - tts_config_path, - speakers_file_path, - language_ids_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - vc_path, - vc_config_path, - model_dir, - args.voice_dir, + # A local model will take precedence if specified via modeL_path + model_name = args.model_name if args.model_path is None else None + api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + encoder_path=args.encoder_path, + encoder_config_path=args.encoder_config_path, + speakers_file_path=args.speakers_file_path, + language_ids_file_path=args.language_ids_file_path, + progress_bar=args.progress_bar, ).to(device) # query speaker ids of a multi-speaker model. if args.list_speaker_idxs: - if synthesizer.tts_model.speaker_manager is None: + if not api.is_multi_speaker: logger.info("Model only has a single speaker.") - return + sys.exit(0) logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - logger.info(list(synthesizer.tts_model.speaker_manager.name_to_id.keys())) - return + logger.info(api.speakers) + sys.exit(0) # query langauge ids of a multi-lingual model. if args.list_language_idxs: - if synthesizer.tts_model.language_manager is None: + if not api.is_multi_lingual: logger.info("Monolingual model.") - return + sys.exit(0) logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - logger.info(synthesizer.tts_model.language_manager.name_to_id) - return + logger.info(api.languages) + sys.exit(0) # check the arguments against a multi-speaker model. 
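For reference, the code path the CLI now delegates to is the high-level `TTS` API instantiated above; the model name below is only an example of a multi-speaker model and is downloaded on first use:

```python
# Hedged sketch of the API the refactored CLI wraps; the model name is an example.
from TTS.api import TTS

api = TTS(model_name="tts_models/en/vctk/vits").to("cpu")
if api.is_multi_speaker:
    print(api.speakers)  # the same list the CLI prints for --list_speaker_idxs
    api.tts_to_file(
        text="Text for TTS",
        speaker=api.speakers[0],
        file_path="output/path/speech.wav",
    )
```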
- if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): logger.error( "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." ) - return + sys.exit(1) # RUN THE SYNTHESIS if args.text: logger.info("Text: %s", args.text) - # kick it - if tts_path is not None: - wav = synthesizer.tts( - args.text, - speaker_name=args.speaker_idx, - language_name=args.language_idx, + if args.text is not None: + api.tts_to_file( + text=args.text, + speaker=args.speaker_idx, + language=args.language_idx, speaker_wav=args.speaker_wav, + pipe_out=pipe_out, + file_path=args.out_path, reference_wav=args.reference_wav, style_wav=args.capacitron_style_wav, style_text=args.capacitron_style_text, reference_speaker_name=args.reference_speaker_idx, + voice_dir=args.voice_dir, ) - elif vc_path is not None: - wav = synthesizer.voice_conversion( + logger.info("Saved TTS output to %s", args.out_path) + elif args.source_wav is not None and args.target_wav is not None: + api.voice_conversion_to_file( source_wav=args.source_wav, target_wav=args.target_wav, + file_path=args.out_path, + pipe_out=pipe_out, ) - elif model_dir is not None: - wav = synthesizer.tts( - args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav - ) - - # save the results - synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) - logger.info("Saved output to %s", args.out_path) + logger.info("Saved VC output to %s", args.out_path) + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index ba03c42b6d..06189a44c3 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -1,25 +1,30 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- + +# TODO: use Trainer import logging import os import sys import time -import traceback import warnings +from dataclasses import dataclass, field import torch from torch.utils.data import DataLoader -from trainer.generic_utils import count_parameters, remove_experiment_folder -from trainer.io import copy_model_files, save_best_model, save_checkpoint +from trainer import TrainerArgs, TrainerConfig +from trainer.generic_utils import count_parameters, get_experiment_folder_path, get_git_branch +from trainer.io import copy_model_files, get_last_checkpoint, save_best_model, save_checkpoint +from trainer.logging import BaseDashboardLogger, ConsoleLogger, logger_factory from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer +from TTS.config import load_config, register_config +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig from TTS.encoder.dataset import EncoderDataset from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.text.characters import parse_symbols from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.samplers import PerfectBatchSampler @@ -34,7 +39,77 @@ print(" > Number of GPUs: ", num_gpus) -def setup_loader(ap: AudioProcessor, is_val: bool = False): +@dataclass +class TrainArgs(TrainerArgs): + config_path: str | None = field(default=None, metadata={"help": "Path to the config 
file."}) + + +def process_args( + args, config: BaseEncoderConfig | None = None +) -> tuple[BaseEncoderConfig, str, str, ConsoleLogger, BaseDashboardLogger | None]: + """Process parsed comand line arguments and initialize the config if not provided. + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. + Returns: + c (Coqpit): Config paramaters. + out_path (str): Path to save models and logging. + audio_path (str): Path to save generated test audios. + c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does + logging to the console. + dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging + TODO: + - Interactive config definition. + """ + coqpit_overrides = None + if isinstance(args, tuple): + args, coqpit_overrides = args + if args.continue_path: + # continue a previous training from its output folder + experiment_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + args.restore_path, best_model = get_last_checkpoint(args.continue_path) + if not args.best_path: + args.best_path = best_model + # init config if not already defined + if config is None: + if args.config_path: + # init from a file + config = load_config(args.config_path) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + + config_base = BaseTrainingConfig() + config_base.parse_known_args(coqpit_overrides) + config = register_config(config_base.model)() + # override values from command-line args + config.parse_known_args(coqpit_overrides, relaxed_parser=True) + experiment_path = args.continue_path + if not experiment_path: + experiment_path = get_experiment_folder_path(config.output_path, config.run_name) + audio_path = os.path.join(experiment_path, "test_audios") + config.output_log_path = experiment_path + # setup rank 0 process in distributed training + dashboard_logger = None + if args.rank == 0: + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + # if model characters are not set in the config file + # save the default set to the config file for future + # compatibility. 
+ if config.has("characters") and config.characters is None: + used_characters = parse_symbols() + new_fields["characters"] = used_characters + copy_model_files(config, experiment_path, new_fields) + dashboard_logger = logger_factory(config, experiment_path) + c_logger = ConsoleLogger() + return config, experiment_path, audio_path, c_logger, dashboard_logger + + +def setup_loader(c: TrainerConfig, ap: AudioProcessor, is_val: bool = False): num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch @@ -84,10 +159,10 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False): return loader, classes, dataset.get_map_classid_to_classname() -def evaluation(model, criterion, data_loader, global_step): +def evaluation(c: BaseEncoderConfig, model, criterion, data_loader, global_step, dashboard_logger: BaseDashboardLogger): eval_loss = 0 for _, data in enumerate(data_loader): - with torch.no_grad(): + with torch.inference_mode(): # setup input data inputs, labels = data @@ -128,7 +203,17 @@ def evaluation(model, criterion, data_loader, global_step): return eval_avg_loss -def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): +def train( + c: BaseEncoderConfig, + model, + optimizer, + scheduler, + criterion, + data_loader, + eval_data_loader, + global_step, + dashboard_logger: BaseDashboardLogger, +): model.train() best_loss = {"train_loss": None, "eval_loss": float("inf")} avg_loader_time = 0 @@ -219,37 +304,33 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.print_step == 0: print( - " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), + f" | > Step:{global_step} Loss:{loss.item():.5f} GradNorm:{grad_norm:.5f} " + f"StepTime:{step_time:.2f} LoaderTime:{loader_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} LR:{current_lr:.6f}", flush=True, ) if global_step % c.save_step == 0: # save model save_checkpoint( - c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict() + c, model, optimizer, None, global_step, epoch, c.output_log_path, criterion=criterion.state_dict() ) end_time = time.time() print("") print( - ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " - "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time - ), + f">>> Epoch:{epoch} AvgLoss: {tot_loss / len(data_loader):.5f} GradNorm:{grad_norm:.5f} " + f"EpochTime:{epoch_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} ", flush=True, ) # evaluation if c.run_eval: model.eval() - eval_loss = evaluation(model, criterion, eval_data_loader, global_step) + eval_loss = evaluation(c, model, criterion, eval_data_loader, global_step, dashboard_logger) print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + f" | > Epoch:{epoch} AvgLoss: {eval_loss:.5f} ", flush=True, ) # save the best checkpoint @@ -262,7 +343,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, None, global_step, epoch, - OUT_PATH, + c.output_log_path, criterion=criterion.state_dict(), ) model.train() @@ -270,7 +351,13 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, return best_loss, 
global_step -def main(args): # pylint: disable=redefined-outer-name +def main(arg_list: list[str] | None = None): + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + + train_config = TrainArgs() + parser = train_config.init_argparse(arg_prefix="") + args, overrides = parser.parse_known_args(arg_list) + c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args((args, overrides)) # pylint: disable=global-variable-undefined global meta_data_train global meta_data_eval @@ -284,9 +371,9 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=redefined-outer-name meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) - train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False) + train_data_loader, train_classes, map_classid_to_classname = setup_loader(c, ap, is_val=False) if c.run_eval: - eval_data_loader, _, _ = setup_loader(ap, is_val=True) + eval_data_loader, _, _ = setup_loader(c, ap, is_val=True) else: eval_data_loader = None @@ -301,7 +388,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion, args.restore_step = model.load_checkpoint( c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion ) - print(" > Model restored from step %d" % args.restore_step, flush=True) + print(f" > Model restored from step {args.restore_step}", flush=True) else: args.restore_step = 0 @@ -311,30 +398,30 @@ def main(args): # pylint: disable=redefined-outer-name scheduler = None num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) if use_cuda: model = model.cuda() criterion.cuda() global_step = args.restore_step - _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step) + _, global_step = train( + c, model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step, dashboard_logger + ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training() - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) + main() + # try: + # main() + # except KeyboardInterrupt: + # remove_experiment_folder(OUT_PATH) + # try: + # sys.exit(0) + # except SystemExit: + # os._exit(0) # pylint: disable=protected-access + # except Exception: # pylint: disable=broad-except + # remove_experiment_folder(OUT_PATH) + # traceback.print_exc() + # sys.exit(1) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 6d6342a762..deaa350878 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,5 +1,6 @@ import logging import os +import sys from dataclasses import dataclass, field from trainer import Trainer, TrainerArgs @@ -15,16 +16,16 @@ class TrainTTSArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", 
level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # init trainer args train_args = TrainTTSArgs() parser = train_args.init_argparse(arg_prefix="") - # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + # override trainer args from command-line args + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -69,6 +70,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 221ff4cff0..58122b9005 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,5 +1,6 @@ import logging import os +import sys from dataclasses import dataclass, field from trainer import Trainer, TrainerArgs @@ -16,16 +17,16 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # init trainer args train_args = TrainVocoderArgs() parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -75,6 +76,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index df2923952d..d05ae14b7f 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -2,6 +2,7 @@ import argparse import logging +import sys from itertools import product as cartesian_product import numpy as np @@ -17,7 +18,7 @@ from TTS.vocoder.models import setup_model if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 5103f200b0..401003504e 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,7 +1,7 @@ import json import os import re -from typing import Dict +from typing import Any, Union import fsspec import yaml @@ -54,11 +54,11 @@ def register_config(model_name: str) -> Coqpit: return config_class -def _process_model_name(config_dict: Dict) -> str: +def _process_model_name(config_dict: dict) -> str: """Format the model name as expected. It is a band-aid for the old `vocoder` model names. Args: - config_dict (Dict): A dictionary including the config fields. + config_dict (dict): A dictionary including the config fields. Returns: str: Formatted modelname. @@ -68,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: str) -> Coqpit: +def load_config(config_path: str | os.PathLike[Any]) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. 
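Because `load_config` now accepts `str | os.PathLike[Any]` and converts the argument with `str()` in the next hunk, path-like objects can be passed directly. A minimal sketch with a placeholder path:

```python
# Minimal sketch, placeholder path; works for both str and pathlib.Path inputs.
from pathlib import Path

from TTS.config import load_config

config = load_config(Path("path/to/config.json"))
print(type(config).__name__)  # the config class registered for the model in the file
```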
@@ -81,6 +81,7 @@ def load_config(config_path: str) -> Coqpit: Returns: Coqpit: TTS config object. """ + config_path = str(config_path) config_dict = {} ext = os.path.splitext(config_path)[1] if ext in (".yml", ".yaml"): diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 7fae77d613..a0a013b0de 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass -from typing import List from coqpit import Coqpit, check_argument from trainer import TrainerConfig @@ -227,7 +226,7 @@ class BaseDatasetConfig(Coqpit): dataset_name: str = "" path: str = "" meta_file_train: str = "" - ignored_speakers: List[str] = None + ignored_speakers: list[str] = None language: str = "" phonemizer: str = "" meta_file_val: str = "" diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py index f838297af3..411a9b0dbe 100644 --- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -5,7 +5,8 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig from TTS.utils.manage import ModelManager diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 7ac38ed6ee..dac5f0870a 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -104,7 +104,7 @@ def isatty(self): def read_logs(): sys.stdout.flush() - with open(sys.stdout.log_file, "r") as f: + with open(sys.stdout.log_file) as f: return f.read() diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index ebbaa0457b..d2d0ef580d 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import MISSING @@ -12,9 +11,9 @@ class BaseEncoderConfig(BaseTrainingConfig): model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params - model_params: Dict = field( + model_params: dict = field( default_factory=lambda: { "model_name": "lstm", "input_dim": 80, @@ -25,7 +24,7 @@ class BaseEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation: Dict = field(default_factory=lambda: {}) + audio_augmentation: dict = field(default_factory=lambda: {}) # training params epochs: int = 10000 @@ -33,7 +32,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 @@ -56,6 +55,6 @@ class BaseEncoderConfig(BaseTrainingConfig): def check_values(self): super().check_values() c = asdict(self) - assert ( - c["model_params"]["input_dim"] == self.audio.num_mels - ), " [!] model input dimendion must be equal to melspectrogram dimension." 
+ assert c["model_params"]["input_dim"] == self.audio.num_mels, ( + " [!] model input dimendion must be equal to melspectrogram dimension." + ) diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index f7137c2186..c6680c3a25 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -5,10 +5,10 @@ import torchaudio from coqpit import Coqpit from torch import nn +from trainer.generic_utils import set_partial_state_dict from trainer.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.utils.generic_utils import set_init_dict logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ class BaseEncoder(nn.Module): # pylint: disable=W0102 def __init__(self): - super(BaseEncoder, self).__init__() + super().__init__() def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( @@ -64,11 +64,11 @@ def get_torch_mel_spectrogram_class(self, audio_config): ), ) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, l2_norm=True): return self.forward(x, l2_norm) - @torch.no_grad() + @torch.inference_mode() def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate embeddings for a batch of utterances @@ -107,7 +107,7 @@ def get_criterion(self, c: Coqpit, num_classes=None): elif c.loss == "softmaxproto": criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) else: - raise Exception("The %s not is a loss supported" % c.loss) + raise Exception(f"The {c.loss} not is a loss supported") return criterion def load_checkpoint( @@ -130,7 +130,7 @@ def load_checkpoint( logger.info("Partial model initialization.") model_dict = self.state_dict() - model_dict = set_init_dict(model_dict, state["model"], c) + model_dict = set_partial_state_dict(model_dict, state["model"], config) self.load_state_dict(model_dict) del model_dict diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 5eafcd6005..d7f3a2f4bd 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -7,7 +7,7 @@ class SELayer(nn.Module): def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() + super().__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction), @@ -27,7 +27,7 @@ class SEBasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) @@ -73,7 +73,7 @@ def __init__( use_torch_spec=False, audio_config=None, ): - super(ResNetSpeakerEncoder, self).__init__() + super().__init__() self.encoder_type = encoder_type self.input_dim = input_dim diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 495b4def5a..54ab37a52f 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -6,13 +6,14 @@ import numpy as np from scipy import signal +from TTS.encoder.models.base_encoder import BaseEncoder from TTS.encoder.models.lstm import LSTMSpeakerEncoder from TTS.encoder.models.resnet import ResNetSpeakerEncoder logger = logging.getLogger(__name__) -class AugmentWAV(object): +class AugmentWAV: def __init__(self, ap, augmentation_config): 
self.ap = ap self.use_additive_noise = False @@ -120,7 +121,7 @@ def apply_one(self, audio): return self.additive_noise(noise_type, audio) -def setup_encoder_model(config: "Coqpit"): +def setup_encoder_model(config: "Coqpit") -> BaseEncoder: if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( config.model_params["input_dim"], @@ -138,4 +139,7 @@ def setup_encoder_model(config: "Coqpit"): use_torch_spec=config.model_params.get("use_torch_spec", False), audio_config=config.audio, ) + else: + msg = f"Model not supported: {config.model_params['model_name']}" + raise ValueError(msg) return model diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index da7522a512..8d50ffd5f5 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo # All rights reserved. # @@ -17,7 +16,7 @@ # Only support eager mode and TF>=2.0.0 # pylint: disable=no-member, invalid-name, relative-beyond-top-level # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes -""" voxceleb 1 & 2 """ +"""voxceleb 1 & 2""" import csv import hashlib @@ -81,19 +80,19 @@ def download_and_extract(directory, subset, urls): zip_filepath = os.path.join(directory, url.split("/")[-1]) if os.path.exists(zip_filepath): continue - logger.info("Downloading %s to %s" % (url, zip_filepath)) + logger.info("Downloading %s to %s", url, zip_filepath) subprocess.call( - "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + "wget {} --user {} --password {} -O {}".format(url, USER["user"], USER["password"], zip_filepath), shell=True, ) statinfo = os.stat(zip_filepath) - logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + logger.info("Successfully downloaded %s, size(bytes): %d", url, statinfo.st_size) # concatenate all parts into zip files if ".zip" not in zip_filepath: zip_filepath = "_".join(zip_filepath.split("_")[:-1]) - subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + subprocess.call(f"cat {zip_filepath}* > {zip_filepath}.zip", shell=True) zip_filepath += ".zip" extract_path = zip_filepath.strip(".zip") @@ -101,12 +100,12 @@ def download_and_extract(directory, subset, urls): with open(zip_filepath, "rb") as f_zip: md5 = hashlib.md5(f_zip.read()).hexdigest() if md5 != MD5SUM[subset]: - raise ValueError("md5sum of %s mismatch" % zip_filepath) + raise ValueError(f"md5sum of {zip_filepath} mismatch") with zipfile.ZipFile(zip_filepath, "r") as zfile: zfile.extractall(directory) extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) - subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + subprocess.call(f"mv {extract_path_ori} {extract_path}", shell=True) finally: # os.remove(zip_filepath) pass @@ -122,9 +121,9 @@ def exec_cmd(cmd): try: retcode = subprocess.call(cmd, shell=True) if retcode < 0: - logger.info(f"Child was terminated by signal {retcode}") + logger.info("Child was terminated by signal %d", retcode) except OSError as e: - logger.info(f"Execution failed: {e}") + logger.info("Execution failed: %s", e) retcode = -999 return retcode @@ -138,10 +137,10 @@ def decode_aac_with_ffmpeg(aac_file, wav_file): bool, True if success. 
""" cmd = f"ffmpeg -i {aac_file} {wav_file}" - logger.info(f"Decoding aac file using command line: {cmd}") + logger.info("Decoding aac file using command line: %s", cmd) ret = exec_cmd(cmd) if ret != 0: - logger.error(f"Failed to decode aac file with retcode {ret}") + logger.error("Failed to decode aac file with retcode %s", ret) logger.error("Please check your ffmpeg installation.") return False return True @@ -156,7 +155,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv """ - logger.info("Preprocessing audio and label for subset %s" % subset) + logger.info("Preprocessing audio and label for subset %s", subset) source_dir = os.path.join(input_dir, subset) files = [] @@ -194,7 +193,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) for wav_file in files: writer.writerow(wav_file) - logger.info("Successfully generated csv file {}".format(csv_file_path)) + logger.info("Successfully generated csv file %s", csv_file_path) def processor(directory, subset, force_process): @@ -216,7 +215,7 @@ def processor(directory, subset, force_process): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) if len(sys.argv) != 4: print("Usage: python prepare_data.py save_directory user password") sys.exit() diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py deleted file mode 100644 index cc3a78b084..0000000000 --- a/TTS/encoder/utils/training.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -from dataclasses import dataclass, field - -from coqpit import Coqpit -from trainer import TrainerArgs, get_last_checkpoint -from trainer.generic_utils import get_experiment_folder_path, get_git_branch -from trainer.io import copy_model_files -from trainer.logging import logger_factory -from trainer.logging.console_logger import ConsoleLogger - -from TTS.config import load_config, register_config -from TTS.tts.utils.text.characters import parse_symbols - - -@dataclass -class TrainArgs(TrainerArgs): - config_path: str = field(default=None, metadata={"help": "Path to the config file."}) - - -def getarguments(): - train_config = TrainArgs() - parser = train_config.init_argparse(arg_prefix="") - return parser - - -def process_args(args, config=None): - """Process parsed comand line arguments and initialize the config if not provided. - Args: - args (argparse.Namespace or dict like): Parsed input arguments. - config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. - Returns: - c (Coqpit): Config paramaters. - out_path (str): Path to save models and logging. - audio_path (str): Path to save generated test audios. - c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does - logging to the console. - dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging - TODO: - - Interactive config definition. 
- """ - if isinstance(args, tuple): - args, coqpit_overrides = args - if args.continue_path: - # continue a previous training from its output folder - experiment_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - args.restore_path, best_model = get_last_checkpoint(args.continue_path) - if not args.best_path: - args.best_path = best_model - # init config if not already defined - if config is None: - if args.config_path: - # init from a file - config = load_config(args.config_path) - else: - # init from console args - from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel - - config_base = BaseTrainingConfig() - config_base.parse_known_args(coqpit_overrides) - config = register_config(config_base.model)() - # override values from command-line args - config.parse_known_args(coqpit_overrides, relaxed_parser=True) - experiment_path = args.continue_path - if not experiment_path: - experiment_path = get_experiment_folder_path(config.output_path, config.run_name) - audio_path = os.path.join(experiment_path, "test_audios") - config.output_log_path = experiment_path - # setup rank 0 process in distributed training - dashboard_logger = None - if args.rank == 0: - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - # if model characters are not set in the config file - # save the default set to the config file for future - # compatibility. - if config.has("characters") and config.characters is None: - used_characters = parse_symbols() - new_fields["characters"] = used_characters - copy_model_files(config, experiment_path, new_fields) - dashboard_logger = logger_factory(config, experiment_path) - c_logger = ConsoleLogger() - return config, experiment_path, audio_path, c_logger, dashboard_logger - - -def init_arguments(): - train_config = TrainArgs() - parser = train_config.init_argparse(arg_prefix="") - return parser - - -def init_training(config: Coqpit = None): - """Initialization of a training run.""" - parser = init_arguments() - args = parser.parse_known_args() - config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config) - return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger diff --git a/TTS/model.py b/TTS/model.py index c3707c85ae..39faa7f690 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,6 +1,6 @@ import os from abc import abstractmethod -from typing import Any, Union +from typing import Any import torch from coqpit import Coqpit @@ -12,7 +12,7 @@ class BaseTrainerModel(TrainerModel): """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. - Every new 🐸TTS model must inherit it. + Every new Coqui model must inherit it. """ @staticmethod @@ -48,7 +48,7 @@ def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict def load_checkpoint( self, config: Coqpit, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -64,3 +64,7 @@ def load_checkpoint( It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ ... 
+ + @property + def device(self) -> torch.device: + return next(self.parameters()).device diff --git a/TTS/server/README.md b/TTS/server/README.md index ae8e38a4e3..232b8618d8 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -1,21 +1,36 @@ -# :frog: TTS demo server +# :frog: TTS Demo Server Before you use the server, make sure you -[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts)) :frog: TTS +[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts) :frog: TTS properly and install the additional dependencies with `pip install coqui-tts[server]`. Then, you can follow the steps below. -**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal. +**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` endpoint on the terminal instead of the `python TTS/server/server.py` arguments. -Examples runs: +## Example commands -List officially released models. -```python TTS/server/server.py --list_models ``` +List officially released models: +```bash +python TTS/server/server.py --list_models # or +tts-server --list_models +``` -Run the server with the official models. -```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` +Run the server with the official models: +```bash +python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA \ + --vocoder_name vocoder_models/en/ljspeech/multiband-melgan +``` -Run the server with the official models on a GPU. -```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda``` +Run the server with the official models on a GPU: +```bash +CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py \ + --model_name tts_models/en/ljspeech/tacotron2-DCA \ + --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda +``` -Run the server with a custom models.
-```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` +Run the server with custom models: +```bash +python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth \ + --tts_config /path/to/tts/config.json \ + --vocoder_checkpoint /path/to/vocoder/model.pth \ + --vocoder_config /path/to/vocoder/config.json +``` diff --git a/TTS/server/server.py b/TTS/server/server.py index f410fb7539..500c706c4e 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -8,9 +8,7 @@ import logging import os import sys -from pathlib import Path from threading import Lock -from typing import Union from urllib.parse import parse_qs try: @@ -19,13 +17,12 @@ msg = "Server requires requires flask, use `pip install coqui-tts[server]`" raise ImportError(msg) from e -from TTS.config import load_config +from TTS.api import TTS from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.manage import ModelManager -from TTS.utils.synthesizer import Synthesizer logger = logging.getLogger(__name__) -setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) +setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) def create_argparser() -> argparse.ArgumentParser: @@ -60,6 +57,7 @@ def create_argparser() -> argparse.ArgumentParser: parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--port", type=int, default=5002, help="port to listen on.") + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.") parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode." @@ -73,8 +71,7 @@ def create_argparser() -> argparse.ArgumentParser: # parse the args args = create_argparser().parse_args() -path = Path(__file__).parent / "../.models.json" -manager = ModelManager(path) +manager = ModelManager(models_file=TTS.get_models_file_path()) # update in-use models to the specified released models.
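The server changes that follow drop the hand-rolled `Synthesizer` setup in favour of the high-level `TTS` API and add a `--device` flag alongside `--use_cuda`. As a rough sketch of the API the server now wraps (the model name is just one released model, not a default):

```python
from TTS.api import TTS

# Load a released model by name and move it to the chosen device.
api = TTS(model_name="tts_models/en/ljspeech/tacotron2-DCA").to("cpu")

# Synthesize and write the waveform, mirroring what the /api/tts route does.
wav = api.tts("Be a voice, not an echo.")
api.synthesizer.save_wav(wav, "output.wav")
```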
model_path = None @@ -86,55 +83,32 @@ def create_argparser() -> argparse.ArgumentParser: # CASE1: list pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() - -# CASE2: load pre-trained model paths -if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - -if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - -# CASE3: set custom model paths -if args.model_path is not None: - model_path = args.model_path - config_path = args.config_path - speakers_file_path = args.speakers_file_path - -if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - -# load models -synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=speakers_file_path, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint="", - encoder_config="", - use_cuda=args.use_cuda, -) - -use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and ( - synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None -) -speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) - -use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and ( - synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None -) -language_manager = getattr(synthesizer.tts_model, "language_manager", None) + sys.exit(0) + +device = args.device +if args.use_cuda: + device = "cuda" + +# CASE2: load models +model_name = args.model_name if args.model_path is None else None +api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + speakers_file_path=args.speakers_file_path, + # language_ids_file_path=args.language_ids_file_path, +).to(device) # TODO: set this from SpeakerManager -use_gst = synthesizer.tts_config.get("use_gst", False) +use_gst = api.synthesizer.tts_config.get("use_gst", False) +supports_cloning = api.synthesizer.tts_config.get("model", "") in ["xtts", "bark"] app = Flask(__name__) -def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: +def style_wav_uri_to_dict(style_wav: str) -> str | dict: """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) or a dict (gst tokens/values to be use for styling) @@ -158,27 +132,19 @@ def index(): return render_template( "index.html", show_details=args.show_details, - use_multi_speaker=use_multi_speaker, - use_multi_language=use_multi_language, - speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None, - language_ids=language_manager.name_to_id if language_manager is not None else None, + use_multi_speaker=api.is_multi_speaker, + use_multi_language=api.is_multi_lingual, + speaker_ids=api.speakers, + language_ids=api.languages, use_gst=use_gst, + supports_cloning=supports_cloning, ) @app.route("/details") def details(): - if args.config_path is not None and os.path.isfile(args.config_path): - model_config = load_config(args.config_path) - elif args.model_name is not None: - model_config = load_config(config_path) - - if 
args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path): - vocoder_config = load_config(args.vocoder_config_path) - elif args.vocoder_name is not None: - vocoder_config = load_config(vocoder_config_path) - else: - vocoder_config = None + model_config = api.synthesizer.tts_config + vocoder_config = api.synthesizer.vocoder_config or None return render_template( "details.html", @@ -196,17 +162,26 @@ def details(): def tts(): with lock: text = request.headers.get("text") or request.values.get("text", "") - speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "") - language_idx = request.headers.get("language-id") or request.values.get("language_id", "") + speaker_idx = ( + request.headers.get("speaker-id") or request.values.get("speaker_id", "") if api.is_multi_speaker else None + ) + if speaker_idx == "": + speaker_idx = None + language_idx = ( + request.headers.get("language-id") or request.values.get("language_id", "") + if api.is_multi_lingual + else None + ) style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) + speaker_wav = request.headers.get("speaker-wav") or request.values.get("speaker_wav", "") logger.info("Model input: %s", text) logger.info("Speaker idx: %s", speaker_idx) logger.info("Language idx: %s", language_idx) - wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) + wavs = api.tts(text, speaker=speaker_idx, language=language_idx, style_wav=style_wav, speaker_wav=speaker_wav) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") @@ -248,9 +223,9 @@ def mary_tts_api_process(): else: text = request.args.get("INPUT_TEXT", "") logger.info("Model input: %s", text) - wavs = synthesizer.tts(text) + wavs = api.tts(text) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") diff --git a/TTS/server/templates/index.html b/TTS/server/templates/index.html index 6bfd5ae2cb..95d7076394 100644 --- a/TTS/server/templates/index.html +++ b/TTS/server/templates/index.html @@ -66,7 +66,12 @@ {%if use_gst%} + type="text" name="style_wav">

+ {%endif%} + + {%if supports_cloning%} + Reference audio: +

{%endif%} @@ -114,14 +119,18 @@ q('#text').focus() function do_tts(e) { const text = q('#text').value - const speaker_id = getTextValue('#speaker_id') const style_wav = getTextValue('#style_wav') + const speaker_wav = getTextValue('#speaker_wav') + let speaker_id = getTextValue('#speaker_id') + if (speaker_wav !== '') { + speaker_id = '' + } const language_id = getTextValue('#language_id') if (text) { q('#message').textContent = 'Synthesizing...' q('#speak-button').disabled = true q('#audio').hidden = true - synthesize(text, speaker_id, style_wav, language_id) + synthesize(text, speaker_id, style_wav, speaker_wav, language_id) } e.preventDefault() return false @@ -132,8 +141,8 @@ do_tts(e) } }) - function synthesize(text, speaker_id = "", style_wav = "", language_id = "") { - fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' }) + function synthesize(text, speaker_id = "", style_wav = "", speaker_wav = "", language_id = "") { + fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&speaker_wav=${encodeURIComponent(speaker_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' }) .then(function (res) { if (!res.ok) throw Error(res.statusText) return res.blob() diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 317a01af53..784819eee3 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.align_tts import AlignTTSArgs @@ -70,7 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) - phase_start_steps: List[int] = None + phase_start_steps: list[int] | None = None ssim_alpha: float = 1.0 spec_loss_alpha: float = 1.0 @@ -80,13 +79,13 @@ class AlignTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None # optimizer parameters optimizer: str = "Adam" optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) - lr_scheduler: str = None - lr_scheduler_params: dict = None + lr_scheduler: str | None = None + lr_scheduler_params: dict | None = None lr: float = 1e-4 grad_clip: float = 5.0 @@ -96,7 +95,7 @@ class AlignTTSConfig(BaseTTSConfig): r: int = 1 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index b846febe85..61d67b987a 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -1,6 +1,5 @@ import os from dataclasses import dataclass, field -from typing import Dict from trainer.io import get_user_data_dir @@ -70,9 +69,9 @@ class BarkConfig(BaseTTSConfig): COARSE_INFER_TOKEN: int = 12_050 REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/" - REMOTE_MODEL_PATHS: Dict = None - LOCAL_MODEL_PATHS: Dict = None - 
SMALL_REMOTE_MODEL_PATHS: Dict = None + REMOTE_MODEL_PATHS: dict = None + LOCAL_MODEL_PATHS: dict = None + SMALL_REMOTE_MODEL_PATHS: dict = None CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0")) DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers")) diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py index 805d995369..fc9a76f613 100644 --- a/TTS/tts/configs/delightful_tts_config.py +++ b/TTS/tts/configs/delightful_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig @@ -73,7 +72,7 @@ class DelightfulTTSConfig(BaseTTSConfig): # optimizer steps_to_start_discriminator: int = 200000 - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -140,7 +139,7 @@ class DelightfulTTSConfig(BaseTTSConfig): d_vector_dim: int = None # testing - test_sentences: List[List[str]] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index d086d26564..1342856668 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -117,10 +116,10 @@ class FastPitchConfig(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -150,10 +149,10 @@ class FastPitchConfig(BaseTTSConfig): # dataset configs compute_f0: bool = True - f0_cache_path: str = None + f0_cache_path: str | None = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index af6c2db6fa..408dbab196 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -111,10 +110,10 @@ class FastSpeechConfig(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -144,10 +143,10 @@ class FastSpeechConfig(BaseTTSConfig): # dataset configs compute_f0: bool = False - f0_cache_path: str = None + f0_cache_path: str | None = None # testing - 
test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index d179617fb0..44bdefad0d 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -127,10 +126,10 @@ class Fastspeech2Config(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -161,14 +160,14 @@ class Fastspeech2Config(BaseTTSConfig): # dataset configs compute_f0: bool = True - f0_cache_path: str = None + f0_cache_path: str | None = None # dataset configs compute_energy: bool = True - energy_cache_path: str = None + energy_cache_path: str | None = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index f42f3e5a51..c99e920b9d 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -101,7 +100,7 @@ class GlowTTSConfig(BaseTTSConfig): model: str = "glow_tts" # model params - num_chars: int = None + num_chars: int | None = None encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -147,15 +146,15 @@ class GlowTTSConfig(BaseTTSConfig): data_dep_init_steps: int = 10 # inference params - style_wav_for_test: str = None + style_wav_for_test: str | None = None inference_noise_scale: float = 0.0 length_scale: float = 1.0 # multi-speaker settings use_speaker_embedding: bool = False - speakers_file: str = None + speakers_file: str | None = None use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None # optimizer parameters optimizer: str = "RAdam" @@ -171,7 +170,7 @@ class GlowTTSConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it. 
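The config hunks around this point all follow the same pattern: `typing.List`/`Dict` become the built-in generics, and fields that were silently optional (or defaulted to `False`) are annotated `X | None`. A small, hypothetical dataclass illustrating the resulting field style (not part of the codebase):

```python
from dataclasses import dataclass, field


@dataclass
class ExampleTTSConfig:  # hypothetical, mirrors the annotation style used above
    speakers_file: str | None = None  # optional value stated explicitly
    d_vector_file: str | None = None  # previously `d_vector_file: str = False`
    test_sentences: list[str] | list[list[str]] = field(default_factory=list)
```

Because the `|` unions in these annotations are evaluated when the dataclass is defined, they presuppose Python 3.10 or newer.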
# testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py index 50f72847ed..108f2022d4 100644 --- a/TTS/tts/configs/neuralhmm_tts_config.py +++ b/TTS/tts/configs/neuralhmm_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -126,7 +125,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.001 @@ -143,7 +142,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -162,9 +161,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py index dc3e5548b8..9e96aaa441 100644 --- a/TTS/tts/configs/overflow_config.py +++ b/TTS/tts/configs/overflow_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -145,7 +144,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.01 @@ -174,7 +173,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -193,9 +192,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. 
Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index bf17322c19..c62f68306d 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import Coqpit, check_argument @@ -138,7 +137,7 @@ class CharactersConfig(Coqpit): characters_class: str = None # using BaseVocabulary - vocab_dict: Dict = None + vocab_dict: dict = None # using on BaseCharacters pad: str = None @@ -323,7 +322,7 @@ class BaseTTSConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -331,7 +330,7 @@ class BaseTTSConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] | list[list[str]] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index bf8517dfc4..b37ba174bf 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -129,10 +128,10 @@ class SpeedySpeechConfig(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -161,10 +160,10 @@ class SpeedySpeechConfig(BaseTTSConfig): # dataset configs compute_f0: bool = False - f0_cache_path: str = None + f0_cache_path: str | None = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 350b5ea996..caa118815a 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig @@ -154,7 +153,7 @@ class TacotronConfig(BaseTTSConfig): num_speakers: int = 1 num_chars: int = 0 r: int = 2 - gradual_training: List[List[int]] = None + 
gradual_training: list[list[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True @@ -170,7 +169,7 @@ class TacotronConfig(BaseTTSConfig): # attention layers attention_type: str = "original" - attention_heads: int = None + attention_heads: int | None = None attention_norm: str = "sigmoid" attention_win: bool = False windowing: bool = False @@ -189,8 +188,8 @@ class TacotronConfig(BaseTTSConfig): use_speaker_embedding: bool = False speaker_embedding_dim: int = 512 use_d_vector_file: bool = False - d_vector_file: str = False - d_vector_dim: int = None + d_vector_file: str | None = None + d_vector_dim: int | None = None # optimizer parameters optimizer: str = "RAdam" @@ -212,7 +211,7 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", @@ -224,12 +223,12 @@ class TacotronConfig(BaseTTSConfig): def check_values(self): if self.gradual_training: - assert ( - self.gradual_training[0][1] == self.r - ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + assert self.gradual_training[0][1] == self.r, ( + f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + ) if self.model == "tacotron" and self.audio is not None: - assert self.out_channels == ( - self.audio.fft_size // 2 + 1 - ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + assert self.out_channels == (self.audio.fft_size // 2 + 1), ( + f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + ) if self.model == "tacotron2" and self.audio is not None: assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 2d0242bf13..9ad720da30 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.vits import VitsArgs, VitsAudioConfig @@ -112,7 +111,7 @@ class VitsConfig(BaseTTSConfig): audio: VitsAudioConfig = field(default_factory=VitsAudioConfig) # optimizer - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -146,7 +145,7 @@ class VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[List] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], @@ -167,7 +166,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index a0766d425c..1ebce57ba5 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs 
import BaseTTSConfig from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig @@ -70,7 +69,7 @@ class XttsConfig(BaseTTSConfig): model_args: XttsArgs = field(default_factory=XttsArgs) audio: XttsAudioConfig = field(default_factory=XttsAudioConfig) model_dir: str = None - languages: List[str] = field( + languages: list[str] = field( default_factory=lambda: [ "en", "es", diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index d1a37da4c1..d83abce00a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -2,8 +2,8 @@ import os import sys from collections import Counter +from collections.abc import Callable from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union import numpy as np @@ -17,7 +17,7 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): + items (list[list]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. eval_split_max_size (int): @@ -37,10 +37,8 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): else: eval_split_size = int(len(items) * eval_split_size) - assert ( - eval_split_size > 0 - ), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format( - 1 / len(items) + assert eval_split_size > 0, ( + f" [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" ) np.random.seed(0) np.random.shuffle(items) @@ -71,18 +69,18 @@ def add_extra_keys(metadata, language, dataset_name): def load_tts_samples( - datasets: Union[List[Dict], Dict], + datasets: list[dict] | dict, eval_split=True, formatter: Callable = None, eval_split_max_size=None, eval_split_size=0.01, -) -> Tuple[List[List], List[List]]: - """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. +) -> tuple[list[list], list[list]]: + """Parse the dataset from the datasets config, load the samples as a list and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based on the dataset name. Args: - datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are + datasets (list[dict], dict): A list of datasets or a single dataset dictionary. If multiple datasets are in the list, they are all merged. eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate @@ -101,7 +99,7 @@ def load_tts_samples( If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). Returns: - Tuple[List[List], List[List]: training and evaluation splits of the dataset. + tuple[list[list], list[list]: training and evaluation splits of the dataset. 
""" meta_data_train_all = [] meta_data_eval_all = [] if eval_split else None @@ -153,7 +151,7 @@ def load_tts_samples( def load_attention_mask_meta_data(metafile_path): """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r", encoding="utf-8") as f: + with open(metafile_path, encoding="utf-8") as f: lines = f.readlines() meta_data = [] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 37e3a1779d..6f21dcd1e0 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -3,7 +3,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import numpy as np import numpy.typing as npt @@ -47,7 +47,7 @@ def string2filename(string: str) -> str: return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") -def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: +def get_audio_size(audiopath: str | os.PathLike[Any]) -> int: """Return the number of samples in the audio file.""" if not isinstance(audiopath, str): audiopath = str(audiopath) @@ -63,29 +63,54 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: raise RuntimeError(msg) from e +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict | None = None): + """Create inverse frequency weights for balancing the dataset. + + Use `multi_dict` to scale relative weights.""" + attr_names_samples = np.array([item[attr_name] for item in items]) + unique_attr_names = np.unique(attr_names_samples).tolist() + attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] + attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) + weight_attr = 1.0 / attr_count + dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + if multi_dict is not None: + # check if all keys are in the multi_dict + for k in multi_dict: + assert k in unique_attr_names, f"{k} not in {unique_attr_names}" + # scale weights + multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) + dataset_samples_weight *= multiplier_samples + return ( + torch.from_numpy(dataset_samples_weight).float(), + unique_attr_names, + np.unique(dataset_samples_weight).tolist(), + ) + + class TTSDataset(Dataset): def __init__( self, outputs_per_step: int = 1, compute_linear_spec: bool = False, ap: AudioProcessor = None, - samples: Optional[list[dict]] = None, + samples: list[dict] | None = None, tokenizer: "TTSTokenizer" = None, compute_f0: bool = False, compute_energy: bool = False, - f0_cache_path: Optional[str] = None, - energy_cache_path: Optional[str] = None, + f0_cache_path: str | None = None, + energy_cache_path: str | None = None, return_wav: bool = False, batch_group_size: int = 0, min_text_len: int = 0, max_text_len: int = float("inf"), min_audio_len: int = 0, max_audio_len: int = float("inf"), - phoneme_cache_path: Optional[str] = None, + phoneme_cache_path: str | None = None, precompute_num_workers: int = 0, - speaker_id_mapping: Optional[dict] = None, - d_vector_mapping: Optional[dict] = None, - language_id_mapping: Optional[dict] = None, + speaker_id_mapping: dict | None = None, + d_vector_mapping: dict | None = None, + language_id_mapping: dict | None = None, use_noise_augment: bool = False, start_by_longest: bool = False, ) -> None: @@ -206,7 +231,7 @@ def lengths(self) -> list[int]: try: audio_len = 
get_audio_size(wav_file) except RuntimeError: - logger.warning(f"Failed to compute length for {item['audio_file']}") + logger.warning("Failed to compute length for %s", item["audio_file"]) audio_len = 0 lens.append(audio_len) return lens @@ -327,7 +352,7 @@ def _compute_lengths(samples): try: audio_length = get_audio_size(item["audio_file"]) except RuntimeError: - logger.warning(f"Failed to compute length, skipping {item['audio_file']}") + logger.warning("Failed to compute length, skipping %s", item["audio_file"]) continue text_lenght = len(item["text"]) item["audio_length"] = audio_length @@ -412,14 +437,14 @@ def preprocess_samples(self) -> None: self.samples = samples logger.info("Preprocessing samples") - logger.info(f"Max text length: {np.max(text_lengths)}") - logger.info(f"Min text length: {np.min(text_lengths)}") - logger.info(f"Avg text length: {np.mean(text_lengths)}") - logger.info(f"Max audio length: {np.max(audio_lengths)}") - logger.info(f"Min audio length: {np.min(audio_lengths)}") - logger.info(f"Avg audio length: {np.mean(audio_lengths)}") + logger.info("Max text length: %d", np.max(text_lengths)) + logger.info("Min text length: %d", np.min(text_lengths)) + logger.info("Avg text length: %.2f", np.mean(text_lengths)) + logger.info("Max audio length: %.2f", np.max(audio_lengths)) + logger.info("Min audio length: %.2f", np.min(audio_lengths)) + logger.info("Avg audio length: %.2f", np.mean(audio_lengths)) logger.info("Num. instances discarded samples: %d", len(ignore_idx)) - logger.info(f"Batch group size: {self.batch_group_size}.") + logger.info("Batch group size: %d", self.batch_group_size) @staticmethod def _sort_batch(batch, text_lengths): @@ -615,7 +640,7 @@ class PhonemeDataset(Dataset): def __init__( self, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], tokenizer: "TTSTokenizer", cache_path: str, precompute_num_workers: int = 0, @@ -719,10 +744,10 @@ class F0Dataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", audio_config=None, # pylint: disable=unused-argument - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers: int = 0, normalize_f0: bool = True, ) -> None: @@ -871,9 +896,9 @@ class EnergyDataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers=0, normalize_energy=True, ) -> None: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ff1a76e2c9..3a4605275a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree as ET from glob import glob from pathlib import Path -from typing import List from tqdm import tqdm @@ -21,7 +20,7 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): https://github.com/freds0/CML-TTS-Dataset/""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -61,7 +60,7 @@ def coqui(root_path, meta_file, ignored_speakers=None): """Interal dataset formatter.""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", 
encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -104,7 +103,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "tweb" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") @@ -118,7 +117,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = cols[1].strip() @@ -133,7 +132,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="ISO 8859-1") as ttf: + with open(txt_file, encoding="ISO 8859-1") as ttf: for line in ttf: cols = line.strip().split("|") wav_file = cols[0].strip() @@ -177,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): if speaker_name in ignored_speakers: continue logger.info(csv_file) - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") if not meta_files: @@ -201,7 +200,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ljspeech" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -215,7 +214,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: speaker_id = 0 for idx, line in enumerate(ttf): # 2 samples per speaker to avoid eval split issues @@ -236,7 +235,7 @@ def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "thorsten" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -268,7 +267,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ruslan" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") @@ -282,7 +281,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "css10" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -296,7 +295,7 @@ def nancy(root_path, 
meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "nancy" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] @@ -309,7 +308,7 @@ def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("client_id"): continue @@ -338,7 +337,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): for meta_file in meta_files: _meta_file = os.path.basename(meta_file).split(".")[0] - with open(meta_file, "r", encoding="utf-8") as ttf: + with open(meta_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") file_name = cols[0] @@ -368,7 +367,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar items = [] speaker_name = "turkish-female" skipped_files = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav") @@ -386,7 +385,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("wav_filename"): continue @@ -425,7 +424,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic """ file_ext = "flac" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -433,7 +432,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] # p280 has no mic2 recordings if speaker_id == "p280": @@ -452,7 +451,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -460,7 +459,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append( @@ -482,7 +481,7 @@ 
def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-ar os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt") ) if os.path.exists(txt_file) and os.path.exists(wav_file): - with open(txt_file, "r", encoding="utf-8") as file_text: + with open(txt_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -500,7 +499,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readline().replace("\n", "") # ignore sentences that contains digits if ignore_digits_sentences and any(map(str.isdigit, text)): @@ -513,7 +512,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] - with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: + with open(os.path.join(root_path, meta_files), encoding="utf-8") as meta: for line in meta: file, text = line.split("\t") text = text[:-1] @@ -553,7 +552,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): # if not exists meta file, crawl recursively for 'wav' files if meta_file is not None: - with open(str(meta_file), "r", encoding="utf-8") as f: + with open(str(meta_file), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] elif not cache_to.exists(): @@ -575,7 +574,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): if cnt < expected_count: raise ValueError(f"Found too few instances for Voxceleb. 
Should be around {expected_count}, is: {cnt}") - with open(str(cache_to), "r", encoding="utf-8") as f: + with open(str(cache_to), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] @@ -583,7 +582,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): """Generic emotion dataset""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("file_path"): continue @@ -601,7 +600,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): return items -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument +def baker(root_path: str, meta_file: str, **kwargs) -> list[list[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -613,7 +612,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) @@ -626,7 +625,7 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kokoro" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -640,7 +639,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kss" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -653,7 +652,7 @@ def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "bel_tts" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index 58a614cb87..87be97d5d1 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,13 +7,14 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py - import torch from einops import pack, unpack from torch import nn from torchaudio.functional import resample from transformers import HubertModel +from TTS.utils.generic_utils import exists + def round_down_nearest_multiple(num, divisor): return num // divisor * divisor @@ -26,14 +27,6 @@ def curtail_to_multiple(t, mult, from_left=False): return t[..., seq_slice] -def exists(val): - return val is not None - - -def default(val, d): - return val if exists(val) else d - - class CustomHubert(nn.Module): """ checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 65c7800dcf..457a20ea28 100644 --- 
a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -2,7 +2,6 @@ import os import re from glob import glob -from typing import Dict, List, Optional, Tuple import librosa import numpy as np @@ -34,9 +33,9 @@ def _normalize_whitespace(text): return re.sub(r"\s+", " ", text).strip() -def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value +def get_voices(extra_voice_dirs: list[str] = []): # pylint: disable=dangerous-default-value dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -49,7 +48,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d return voices -def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: +def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] @@ -58,10 +57,8 @@ def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64 def load_voice( - model, voice: str, extra_voice_dirs: List[str] = [] -) -> Tuple[ - Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]] -]: # pylint: disable=dangerous-default-value + model, voice: str, extra_voice_dirs: list[str] = [] +) -> tuple[npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None @@ -206,8 +203,8 @@ def generate_text_semantic( semantic_history = None encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET if len(encoded_text) > 256: - p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1) - logger.warning(f"warning, text too long, lopping of last {p}%") + p = (len(encoded_text) - 256) / len(encoded_text) * 100 + logger.warning("warning, text too long, lopping of last %.1f%%", p) encoded_text = encoded_text[:256] encoded_text = np.pad( encoded_text, diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index 6b7caab916..dcec5b5bbc 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -88,7 +88,7 @@ def clear_cuda_cache(): def load_model(ckpt_path, device, config, model_type="text"): - logger.info(f"loading {model_type} model from {ckpt_path}...") + logger.info("loading %s model from %s...", model_type, ckpt_path) if device == "cpu": logger.warning("No GPU being used. 
Careful, Inference might be extremely slow!") @@ -108,11 +108,13 @@ def load_model(ckpt_path, device, config, model_type="text"): and os.path.exists(ckpt_path) and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"] ): - logger.warning(f"found outdated {model_type} model, removing...") + logger.warning("found outdated %s model, removing...", model_type) os.remove(ckpt_path) if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading...") - _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR) + logger.info("%s model not found, downloading...", model_type) + # The URL in the config is a 404 and needs to be fixed + download_url = config.REMOTE_MODEL_PATHS[model_type]["path"].replace("tree", "resolve") + _download(download_url, ckpt_path, config.CACHE_DIR) checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4()) # this is a hack @@ -148,7 +150,7 @@ def load_model(ckpt_path, device, config, model_type="text"): model.load_state_dict(state_dict, strict=False) n_params = model.get_num_params() val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") + logger.info("model loaded: %.1fM params, %.3f loss", n_params / 1e6, val_loss) model.eval() model.to(device) del checkpoint, state_dict diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 68c50dbdbd..4850d0a88b 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -12,18 +12,6 @@ from torch.nn import functional as F -class LayerNorm(nn.Module): - """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False""" - - def __init__(self, ndim, bias): - super().__init__() - self.weight = nn.Parameter(torch.ones(ndim)) - self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None - - def forward(self, x): - return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5) - - class CausalSelfAttention(nn.Module): def __init__(self, config): super().__init__() @@ -119,9 +107,9 @@ def forward(self, x): class Block(nn.Module): def __init__(self, config, layer_idx): super().__init__() - self.ln_1 = LayerNorm(config.n_embd, bias=config.bias) + self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias) self.attn = CausalSelfAttention(config) - self.ln_2 = LayerNorm(config.n_embd, bias=config.bias) + self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias) self.mlp = MLP(config) self.layer_idx = layer_idx @@ -158,7 +146,7 @@ def __init__(self, config): wpe=nn.Embedding(config.block_size, config.n_embd), drop=nn.Dropout(config.dropout), h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]), - ln_f=LayerNorm(config.n_embd, bias=config.bias), + ln_f=nn.LayerNorm(config.n_embd, bias=config.bias), ) ) self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False) @@ -187,9 +175,9 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use assert idx.shape[1] >= 256 + 256 + 1 t = idx.shape[1] - 256 else: - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) # forward the GPT model itself if merge_context: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 29126b41ab..20f54d2152 100644 --- 
a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -101,9 +101,9 @@ def __init__(self, config): def forward(self, pred_idx, idx): device = idx.device b, t, codes = idx.size() - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) assert pred_idx > 0, "cannot predict 0th codebook" assert codes == self.n_codes_total, (b, t, codes) pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t) diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 3c0e3a3a76..9110ff5fd0 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -1,6 +1,6 @@ ### credit: https://github.com/dunky11/voicesmith import logging -from typing import Callable, Dict, Tuple +from collections.abc import Callable import torch import torch.nn.functional as F @@ -12,7 +12,6 @@ from TTS.tts.layers.delightful_tts.encoders import ( PhonemeLevelProsodyEncoder, UtteranceLevelProsodyEncoder, - get_mask_from_lengths, ) from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding @@ -20,7 +19,7 @@ from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor from TTS.tts.layers.generic.aligner import AlignmentNetwork -from TTS.tts.utils.helpers import generate_path, sequence_mask +from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask logger = logging.getLogger(__name__) @@ -178,7 +177,7 @@ def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument self._init_d_vector() @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -195,11 +194,11 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -231,42 +230,6 @@ def _init_d_vector(self): raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") self.embedded_speaker_dim = self.args.d_vector_dim - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - """Generate an attention mask from the linear scale durations. - - Args: - dr (Tensor): Linear scale durations. - x_mask (Tensor): Mask for the input (character) sequence. - y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations - if None. Defaults to None. 
- - Shapes - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - """ - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def _expand_encoder_with_durations( - self, - o_en: torch.FloatTensor, - dr: torch.IntTensor, - x_mask: torch.IntTensor, - y_lengths: torch.IntTensor, - ): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en]) - return y_mask, o_en_ex, attn.transpose(1, 2) - def _forward_aligner( self, x: torch.FloatTensor, @@ -274,7 +237,7 @@ def _forward_aligner( x_mask: torch.IntTensor, y_mask: torch.IntTensor, attn_priors: torch.FloatTensor, - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -335,13 +298,13 @@ def forward( use_ground_truth: bool = True, d_vectors: torch.Tensor = None, speaker_idx: torch.Tensor = None, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} ) # pylint: disable=unused-variable - src_mask = get_mask_from_lengths(src_lens) # [B, T_src] - mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel] + src_mask = ~sequence_mask(src_lens) # [B, T_src] + mel_mask = ~sequence_mask(mel_lens) # [B, T_mel] # Token embeddings token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden] @@ -420,8 +383,8 @@ def forward( encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask) - mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations( - o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None] + encoder_outputs_ex, alignments, mel_pred_mask = expand_encoder_outputs( + encoder_outputs, y_lengths=mel_lens, duration=dr, x_mask=~src_mask[:, None] ) x = self.decoder( @@ -435,7 +398,7 @@ def forward( dr = torch.log(dr + 1) dr_pred = torch.exp(log_duration_prediction) - 1 - alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2'] + alignments_dp = generate_attention(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2'] return { "model_outputs": x, @@ -448,7 +411,7 @@ def forward( "p_prosody_pred": p_prosody_pred, "p_prosody_ref": p_prosody_ref, "alignments_dp": alignments_dp, - "alignments": alignments, # [B, T_de, T_en] + "alignments": alignments.transpose(1, 2), # [B, T_de, T_en] "aligner_soft": aligner_soft, "aligner_mas": aligner_mas, "aligner_durations": aligner_durations, @@ -458,7 +421,7 @@ def forward( "spk_emb": speaker_embedding, } - @torch.no_grad() + @torch.inference_mode() def inference( self, tokens: torch.Tensor, @@ -469,7 +432,7 @@ def inference( pitch_transform: Callable = None, energy_transform: Callable = None, ) -> torch.Tensor: - src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device)) + 
src_mask = ~sequence_mask(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device)) src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} @@ -536,11 +499,11 @@ def inference( duration_pred = torch.round(duration_pred) # -> [B, T_src] mel_lens = duration_pred.sum(1) # -> [B,] - _, encoder_outputs_ex, alignments = self._expand_encoder_with_durations( - o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, None] + encoder_outputs_ex, alignments, _ = expand_encoder_outputs( + encoder_outputs, y_lengths=mel_lens, duration=duration_pred.squeeze(1), x_mask=~src_mask[:, None] ) - mel_mask = get_mask_from_lengths( + mel_mask = ~sequence_mask( torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device) ) @@ -557,7 +520,7 @@ def inference( x = self.to_mel(x) outputs = { "model_outputs": x, - "alignments": alignments, + "alignments": alignments.transpose(1, 2), # "pitch": pitch_emb_pred, "durations": duration_pred, "pitch": pitch_pred, diff --git a/TTS/tts/layers/delightful_tts/conformer.py b/TTS/tts/layers/delightful_tts/conformer.py index b2175b3b96..227a871c69 100644 --- a/TTS/tts/layers/delightful_tts/conformer.py +++ b/TTS/tts/layers/delightful_tts/conformer.py @@ -1,20 +1,14 @@ ### credit: https://github.com/dunky11/voicesmith import math -from typing import Tuple import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d +from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d, calc_same_padding from TTS.tts.layers.delightful_tts.networks import GLUActivation -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - class Conformer(nn.Module): def __init__( self, @@ -322,7 +316,7 @@ def forward( value: torch.Tensor, mask: torch.Tensor, encoding: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable encoding = encoding[:, : key.shape[1]] encoding = encoding.repeat(batch_size, 1, 1) @@ -378,7 +372,7 @@ def forward( value: torch.Tensor, pos_embedding: torch.Tensor, mask: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size = query.shape[0] query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head) key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) @@ -411,40 +405,3 @@ def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor: # pylint: d padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1) pos_score = padded_pos_score[:, :, 1:].view_as(pos_score) return pos_score - - -class MultiHeadAttention(nn.Module): - """ - input: - query --- [N, T_q, query_dim] - key --- [N, T_k, key_dim] - output: - out --- [N, T_q, num_units] - """ - - def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): - super().__init__() - self.num_units = num_units - self.num_heads = num_heads - self.key_dim = key_dim - - self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False) - 
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - - def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: - querys = self.W_query(query) # [N, T_q, num_units] - keys = self.W_key(key) # [N, T_k, num_units] - values = self.W_value(key) - split_size = self.num_units // self.num_heads - querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h] - keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k ** 0.5)) - scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k] - scores = scores / (self.key_dim**0.5) - scores = F.softmax(scores, dim=3) - # out = score * V - out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] - return out diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index fb9aa4495f..5cf41d4ff6 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -1,14 +1,9 @@ -from typing import Tuple - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -from torch.nn.utils import parametrize - -from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: +def calc_same_padding(kernel_size: int) -> tuple[int, int]: pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) @@ -55,7 +50,7 @@ def __init__( w_init_gain="linear", use_weight_norm=False, ): - super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments + super().__init__() if padding is None: assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) @@ -97,7 +92,7 @@ def __init__( lstm_type="bilstm", use_linear=True, ): - super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.out_dim = out_dim self.lstm_type = lstm_type self.use_linear = use_linear @@ -530,142 +525,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.addcoords(x) x = self.conv(x) return x - - -class LVCBlock(torch.nn.Module): - """the location-variable convolutions""" - - def __init__( # pylint: disable=dangerous-default-value - self, - in_channels, - cond_channels, - stride, - dilations=[1, 3, 9, 27], - lReLU_slope=0.2, - conv_kernel_size=3, - cond_hop_length=256, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - ): - super().__init__() - - self.cond_hop_length = cond_hop_length - self.conv_layers = len(dilations) - self.conv_kernel_size = conv_kernel_size - - self.kernel_predictor = KernelPredictor( - cond_channels=cond_channels, - conv_in_channels=in_channels, - conv_out_channels=2 * in_channels, - conv_layers=len(dilations), - conv_kernel_size=conv_kernel_size, - kpnet_hidden_channels=kpnet_hidden_channels, - kpnet_conv_size=kpnet_conv_size, - kpnet_dropout=kpnet_dropout, - kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope}, - ) - - self.convt_pre = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.parametrizations.weight_norm( - nn.ConvTranspose1d( - in_channels, - in_channels, - 2 * stride, - stride=stride, - padding=stride // 2 + stride % 2, - 
output_padding=stride % 2, - ) - ), - ) - - self.conv_blocks = nn.ModuleList() - for dilation in dilations: - self.conv_blocks.append( - nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - in_channels, - in_channels, - conv_kernel_size, - padding=dilation * (conv_kernel_size - 1) // 2, - dilation=dilation, - ) - ), - nn.LeakyReLU(lReLU_slope), - ) - ) - - def forward(self, x, c): - """forward propagation of the location-variable convolutions. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length) - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - - Returns: - Tensor: the output sequence (batch, in_channels, in_length) - """ - _, in_channels, _ = x.shape # (B, c_g, L') - - x = self.convt_pre(x) # (B, c_g, stride * L') - kernels, bias = self.kernel_predictor(c) - - for i, conv in enumerate(self.conv_blocks): - output = conv(x) # (B, c_g, stride * L') - - k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) - b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) - - output = self.location_variable_convolution( - output, k, b, hop_size=self.cond_hop_length - ) # (B, 2 * c_g, stride * L'): LVC - x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( - output[:, in_channels:, :] - ) # (B, c_g, stride * L'): GAU - - return x - - def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use - """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. - Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length). - kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) - bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) - dilation (int): the dilation of convolution. - hop_size (int): the hop_size of the conditioning sequence. - Returns: - (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
- """ - batch, _, in_length = x.shape - batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" - - padding = dilation * int((kernel_size - 1) / 2) - x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) - x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) - - if hop_size < dilation: - x = F.pad(x, (0, dilation), "constant", 0) - x = x.unfold( - 3, dilation, dilation - ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) - x = x[:, :, :, :, :hop_size] - x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) - x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) - - o = torch.einsum("bildsk,biokl->bolsd", x, kernel) - o = o.to(memory_format=torch.channels_last_3d) - bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) - o = o + bias - o = o.contiguous().view(batch, out_channels, -1) - - return o - - def remove_weight_norm(self): - self.kernel_predictor.remove_weight_norm() - parametrize.remove_parametrizations(self.convt_pre[1], "weight") - for block in self.conv_blocks: - parametrize.remove_parametrizations(block[1], "weight") diff --git a/TTS/tts/layers/delightful_tts/encoders.py b/TTS/tts/layers/delightful_tts/encoders.py index 0878f0677a..31bab8cc97 100644 --- a/TTS/tts/layers/delightful_tts/encoders.py +++ b/TTS/tts/layers/delightful_tts/encoders.py @@ -1,5 +1,3 @@ -from typing import List, Tuple, Union - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F @@ -7,14 +5,7 @@ from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d from TTS.tts.layers.delightful_tts.networks import STL - - -def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1) - mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) - return mask +from TTS.tts.utils.helpers import sequence_mask def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: @@ -43,9 +34,9 @@ class ReferenceEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, ): super().__init__() @@ -87,13 +78,13 @@ def __init__( batch_first=True, ) - def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ inputs --- [N, n_mels, timesteps] outputs --- [N, E//2] """ - mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1) + mel_masks = ~sequence_mask(mel_lens).unsqueeze(1) x = x.masked_fill(mel_masks, 0) for conv, norm in zip(self.convs, self.norms): x = conv(x) @@ -103,7 +94,7 @@ def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor for _ in range(2): mel_lens = stride_lens(mel_lens) - mel_masks = 
get_mask_from_lengths(mel_lens) + mel_masks = ~sequence_mask(mel_lens) x = x.masked_fill(mel_masks.unsqueeze(1), 0) x = x.permute((0, 2, 1)) @@ -127,9 +118,9 @@ class UtteranceLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, @@ -199,9 +190,9 @@ class PhonemeLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, diff --git a/TTS/tts/layers/delightful_tts/energy_adaptor.py b/TTS/tts/layers/delightful_tts/energy_adaptor.py index ea0d1e4721..d2b4b0ffa8 100644 --- a/TTS/tts/layers/delightful_tts/energy_adaptor.py +++ b/TTS/tts/layers/delightful_tts/energy_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -59,7 +59,7 @@ def __init__( def get_energy_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/delightful_tts/kernel_predictor.py b/TTS/tts/layers/delightful_tts/kernel_predictor.py deleted file mode 100644 index 96c550b6c2..0000000000 --- a/TTS/tts/layers/delightful_tts/kernel_predictor.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch.nn as nn # pylint: disable=consider-using-from-import -from torch.nn.utils import parametrize - - -class KernelPredictor(nn.Module): - """Kernel predictor for the location-variable convolutions - - Args: - cond_channels (int): number of channel for the conditioning sequence, - conv_in_channels (int): number of channel for the input sequence, - conv_out_channels (int): number of channel for the output sequence, - conv_layers (int): number of layers - - """ - - def __init__( # pylint: disable=dangerous-default-value - self, - cond_channels, - conv_in_channels, - conv_out_channels, - conv_layers, - conv_kernel_size=3, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - kpnet_nonlinear_activation="LeakyReLU", - kpnet_nonlinear_activation_params={"negative_slope": 0.1}, - ): - super().__init__() - - self.conv_in_channels = conv_in_channels - self.conv_out_channels = conv_out_channels - self.conv_kernel_size = conv_kernel_size - self.conv_layers = conv_layers - - kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w - kpnet_bias_channels = conv_out_channels * conv_layers # l_b - - self.input_conv = nn.Sequential( - nn.utils.parametrizations.weight_norm( - nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - - self.residual_convs = nn.ModuleList() - padding = (kpnet_conv_size - 1) // 2 - for _ in range(3): - self.residual_convs.append( - nn.Sequential( - nn.Dropout(kpnet_dropout), - 
nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - ) - self.kernel_conv = nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_kernel_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - self.bias_conv = nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_bias_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - - def forward(self, c): - """ - Args: - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - """ - batch, _, cond_length = c.shape - c = self.input_conv(c) - for residual_conv in self.residual_convs: - residual_conv.to(c.device) - c = c + residual_conv(c) - k = self.kernel_conv(c) - b = self.bias_conv(c) - kernels = k.contiguous().view( - batch, - self.conv_layers, - self.conv_in_channels, - self.conv_out_channels, - self.conv_kernel_size, - cond_length, - ) - bias = b.contiguous().view( - batch, - self.conv_layers, - self.conv_out_channels, - cond_length, - ) - - return kernels, bias - - def remove_weight_norm(self): - parametrize.remove_parametrizations(self.input_conv[0], "weight") - parametrize.remove_parametrizations(self.kernel_conv, "weight") - parametrize.remove_parametrizations(self.bias_conv, "weight") - for block in self.residual_convs: - parametrize.remove_parametrizations(block[1], "weight") - parametrize.remove_parametrizations(block[3], "weight") diff --git a/TTS/tts/layers/delightful_tts/networks.py b/TTS/tts/layers/delightful_tts/networks.py index 4305022f18..93b65a2a74 100644 --- a/TTS/tts/layers/delightful_tts/networks.py +++ b/TTS/tts/layers/delightful_tts/networks.py @@ -1,5 +1,4 @@ import math -from typing import Tuple import numpy as np import torch @@ -9,7 +8,7 @@ from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: +def initialize_embeddings(shape: tuple[int]) -> torch.Tensor: assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." 
# Kaiming initialization return torch.randn(shape) * np.sqrt(2 / shape[1]) @@ -52,7 +51,7 @@ def __init__( kernel_size=3, use_partial_padding=False, # pylint: disable=unused-argument ): - super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.reduction_factor = reduction_factor reduced_dim = int(in_dim / reduction_factor) @@ -195,7 +194,7 @@ class STL(nn.Module): """ def __init__(self, n_hidden: int, token_num: int): - super(STL, self).__init__() # pylint: disable=super-with-arguments + super().__init__() num_heads = 1 E = n_hidden diff --git a/TTS/tts/layers/delightful_tts/pitch_adaptor.py b/TTS/tts/layers/delightful_tts/pitch_adaptor.py index 9031369e0f..14e751d2e2 100644 --- a/TTS/tts/layers/delightful_tts/pitch_adaptor.py +++ b/TTS/tts/layers/delightful_tts/pitch_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -58,7 +58,7 @@ def __init__( def get_pitch_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/feed_forward/encoder.py b/TTS/tts/layers/feed_forward/encoder.py index caf939ffc7..2d08f03c2d 100644 --- a/TTS/tts/layers/feed_forward/encoder.py +++ b/TTS/tts/layers/feed_forward/encoder.py @@ -143,9 +143,9 @@ def __init__( elif encoder_type.lower() == "residual_conv_bn": self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params) elif encoder_type.lower() == "fftransformer": - assert ( - in_hidden_channels == out_channels - ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + assert in_hidden_channels == out_channels, ( + "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + ) # pylint: disable=unexpected-keyword-arg self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) else: diff --git a/TTS/tts/layers/generic/aligner.py b/TTS/tts/layers/generic/aligner.py index baa6f0e9c4..480c48f9a4 100644 --- a/TTS/tts/layers/generic/aligner.py +++ b/TTS/tts/layers/generic/aligner.py @@ -1,5 +1,3 @@ -from typing import Tuple - import torch from torch import nn @@ -68,7 +66,7 @@ def init_layers(self): def forward( self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None - ) -> Tuple[torch.tensor, torch.tensor]: + ) -> tuple[torch.tensor, torch.tensor]: """Forward pass of the aligner encoder. 
Shapes: - queries: :math:`[B, C, T_de]` diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py index 913add0d14..7765e224aa 100644 --- a/TTS/tts/layers/generic/pos_encoding.py +++ b/TTS/tts/layers/generic/pos_encoding.py @@ -18,9 +18,7 @@ class PositionalEncoding(nn.Module): def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): super().__init__() if channels % 2 != 0: - raise ValueError( - "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels) - ) + raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={channels:d})") self.use_scale = use_scale if use_scale: self.scale = torch.nn.Parameter(torch.ones(1)) diff --git a/TTS/tts/layers/generic/transformer.py b/TTS/tts/layers/generic/transformer.py index 9b7ecee2ba..2fe9bcc408 100644 --- a/TTS/tts/layers/generic/transformer.py +++ b/TTS/tts/layers/generic/transformer.py @@ -70,9 +70,7 @@ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument class FFTDurationPredictor: - def __init__( - self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None - ): # pylint: disable=unused-argument + def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None): # pylint: disable=unused-argument self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p) self.proj = nn.Linear(in_channels, 1) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 5ebed81dda..1e744d62cf 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -309,6 +309,24 @@ def forward(self, attn_logprob, in_lens, out_lens): return total_loss +class NLLLoss(nn.Module): + """Negative log likelihood loss.""" + + def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use + """Compute the loss. 
+ + Args: + logits (Tensor): [B, T, D] + + Returns: + Tensor: [1] + + """ + return_dict = {} + return_dict["loss"] = -log_prob.mean() + return return_dict + + ######################## # MODEL LOSS LAYERS ######################## @@ -619,6 +637,28 @@ def forward( return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} +def feature_loss(feats_real, feats_generated): + loss = 0 + for dr, dg in zip(feats_real, feats_generated): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + return loss * 2 + + +def generator_loss(scores_fake): + loss = 0 + gen_losses = [] + for dg in scores_fake: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + class VitsGeneratorLoss(nn.Module): def __init__(self, c: Coqpit): super().__init__() @@ -640,28 +680,6 @@ def __init__(self, c: Coqpit): do_amp_to_db=True, ) - @staticmethod - def feature_loss(feats_real, feats_generated): - loss = 0 - for dr, dg in zip(feats_real, feats_generated): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 - - @staticmethod - def generator_loss(scores_fake): - loss = 0 - gen_losses = [] - for dg in scores_fake: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - @staticmethod def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): """ @@ -722,10 +740,8 @@ def forward( self.kl_loss(z_p=z_p, logs_q=logs_q, m_p=m_p, logs_p=logs_p, z_mask=z_mask.unsqueeze(1)) * self.kl_loss_alpha ) - loss_feat = ( - self.feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha - ) - loss_gen = self.generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha + loss_feat = feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha + loss_gen = generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel_slice, mel_slice_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration @@ -779,6 +795,15 @@ def forward(self, scores_disc_real, scores_disc_fake): return return_dict +def _binary_alignment_loss(alignment_hard, alignment_soft): + """Binary loss that forces soft alignments to match the hard alignments. + + Explained in `https://arxiv.org/pdf/2108.10447.pdf`. + """ + log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() + return -log_sum / alignment_hard.sum() + + class ForwardTTSLoss(nn.Module): """Generic configurable ForwardTTS loss.""" @@ -789,7 +814,7 @@ def __init__(self, c): elif c.spec_loss_type == "l1": self.spec_loss = L1LossMasked(False) else: - raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type)) + raise ValueError(f" [!] Unknown spec_loss_type {c.spec_loss_type}") if c.duration_loss_type == "mse": self.dur_loss = MSELossMasked(False) @@ -798,7 +823,7 @@ def __init__(self, c): elif c.duration_loss_type == "huber": self.dur_loss = Huber() else: - raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type)) + raise ValueError(f" [!] 
Unknown duration_loss_type {c.duration_loss_type}") if c.model_args.use_aligner: self.aligner_loss = ForwardSumLoss() @@ -820,14 +845,6 @@ def __init__(self, c): self.dur_loss_alpha = c.dur_loss_alpha self.binary_alignment_loss_alpha = c.binary_align_loss_alpha - @staticmethod - def _binary_alignment_loss(alignment_hard, alignment_soft): - """Binary loss that forces soft alignments to match the hard alignments as - explained in `https://arxiv.org/pdf/2108.10447.pdf`. - """ - log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() - return -log_sum / alignment_hard.sum() - def forward( self, decoder_output, @@ -879,7 +896,7 @@ def forward( return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None: - binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) + binary_alignment_loss = _binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss if binary_loss_weight: return_dict["loss_binary_alignment"] = ( diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py index 9f77af293c..a477b34f0b 100644 --- a/TTS/tts/layers/overflow/common_layers.py +++ b/TTS/tts/layers/overflow/common_layers.py @@ -1,5 +1,4 @@ import logging -from typing import List, Tuple import torch import torch.nn.functional as F @@ -44,7 +43,7 @@ def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutio ) self.rnn_state = None - def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]: + def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> tuple[torch.FloatTensor, torch.LongTensor]: """Forward pass to the encoder. Args: @@ -110,7 +109,7 @@ class ParameterModel(nn.Module): def __init__( self, - outputnet_size: List[int], + outputnet_size: list[int], input_size: int, output_size: int, frame_channels: int, @@ -152,7 +151,7 @@ def __init__( encoder_dim: int, memory_rnn_dim: int, frame_channels: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float = 1e-2, ): diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py index a12becef03..9142f65e8c 100644 --- a/TTS/tts/layers/overflow/neural_hmm.py +++ b/TTS/tts/layers/overflow/neural_hmm.py @@ -1,5 +1,3 @@ -from typing import List - import torch import torch.distributions as tdist import torch.nn.functional as F @@ -57,7 +55,7 @@ def __init__( prenet_dropout: float, prenet_dropout_at_inference: bool, memory_rnn_dim: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float, use_grad_checkpointing: bool = True, diff --git a/TTS/tts/layers/tacotron/capacitron_layers.py b/TTS/tts/layers/tacotron/capacitron_layers.py index 2181ffa7ec..817f42771b 100644 --- a/TTS/tts/layers/tacotron/capacitron_layers.py +++ b/TTS/tts/layers/tacotron/capacitron_layers.py @@ -3,6 +3,8 @@ from torch.distributions.multivariate_normal import MultivariateNormal as MVN from torch.nn import functional as F +from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height + class CapacitronVAE(nn.Module): """Effective Use of Variational Embedding Capacity for prosody transfer. 
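# --- A minimal sketch (not part of the patch) of the module-level _binary_alignment_loss
# --- that the losses.py hunk above now calls instead of the former static method.
# --- Shapes and values below are toy assumptions for illustration only.
import torch


def _binary_alignment_loss(alignment_hard, alignment_soft):
    """Force soft alignments towards the hard (MAS) alignments, as in arXiv:2108.10447."""
    log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum()
    return -log_sum / alignment_hard.sum()


# Hard alignment: one text token selected per decoder frame (one-hot rows), [B, T_de, T_en].
alignment_hard = torch.tensor([[[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]])
# Soft alignment: attention probabilities over text tokens per frame, same shape.
alignment_soft = torch.tensor([[[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]]])
loss = _binary_alignment_loss(alignment_hard, alignment_soft)
# -(ln 0.9 + ln 0.8 + ln 0.6) / 3 ≈ 0.28; the loss shrinks as the soft attention
# concentrates its mass on the hard alignment path.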
@@ -97,7 +99,7 @@ def __init__(self, num_mel, out_dim): self.training = False self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) - post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) + post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) self.recurrence = nn.LSTM( input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False ) @@ -155,13 +157,6 @@ def forward(self, inputs, input_lengths): return last_output.to(inputs.device) # [B, 128] - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - class TextSummary(nn.Module): def __init__(self, embedding_dim, encoder_output_dim): diff --git a/TTS/tts/layers/tacotron/common_layers.py b/TTS/tts/layers/tacotron/common_layers.py index f78ff1e75f..16e517fdca 100644 --- a/TTS/tts/layers/tacotron/common_layers.py +++ b/TTS/tts/layers/tacotron/common_layers.py @@ -3,6 +3,13 @@ from torch.nn import functional as F +def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int: + """Height of spec after n convolutions with fixed kernel/stride/pad.""" + for _ in range(n_convs): + height = (height - kernel_size + 2 * pad) // stride + 1 + return height + + class Linear(nn.Module): """Linear layer with a specific initialization. diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index 05dba7084f..4a83fb1c83 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -2,6 +2,8 @@ import torch.nn.functional as F from torch import nn +from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height + class GST(nn.Module): """Global Style Token Module for factorizing prosody in speech. 
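# --- A minimal sketch (not part of the patch) of the shared calculate_post_conv_height()
# --- helper that the Capacitron and GST reference encoders now import from common_layers.
# --- The 80-band mel input and the six stride-2 convolutions are illustrative assumptions.
def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int:
    """Height of spec after n convolutions with fixed kernel/stride/pad."""
    for _ in range(n_convs):
        height = (height - kernel_size + 2 * pad) // stride + 1
    return height


assert calculate_post_conv_height(80, 3, 2, 1, 6) == 2  # GST call site uses pad=1
assert calculate_post_conv_height(80, 3, 2, 2, 6) == 4  # Capacitron call site uses pad=2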
@@ -44,7 +46,7 @@ def __init__(self, num_mel, embedding_dim): self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) - post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 1, num_layers) + post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 1, num_layers) self.recurrence = nn.GRU( input_size=filters[-1] * post_conv_height, hidden_size=embedding_dim // 2, batch_first=True ) @@ -71,13 +73,6 @@ def forward(self, inputs): return out.squeeze(0) - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" @@ -117,7 +112,7 @@ class MultiHeadAttention(nn.Module): out --- [N, T_q, num_units] """ - def __init__(self, query_dim, key_dim, num_units, num_heads): + def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): super().__init__() self.num_units = num_units self.num_heads = num_heads @@ -127,7 +122,7 @@ def __init__(self, query_dim, key_dim, num_units, num_heads): self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - def forward(self, query, key): + def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: queries = self.W_query(query) # [N, T_q, num_units] keys = self.W_key(key) # [N, T_k, num_units] values = self.W_value(key) @@ -137,13 +132,11 @@ def forward(self, query, key): keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k**0.5)) + # score = softmax(QK^T / (d_k ** 0.5)) scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k] scores = scores / (self.key_dim**0.5) scores = F.softmax(scores, dim=3) # out = score * V out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] - - return out + return torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 32643dfcee..6f33edf3d7 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,3 @@ -# coding: utf-8 # adapted from https://github.com/r9y9/tacotron_pytorch import logging diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index 8eda251f93..508699fee3 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F import torchaudio -from transformers import LogitsWarper +from transformers import LogitsProcessor from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias from TTS.utils.generic_utils import is_pytorch_at_least_2_4 @@ -70,11 +70,10 @@ def forward(self, qkv, mask=None, rel_pos=None): weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape( bs * self.n_heads, weight.shape[-2], weight.shape[-1] ) - weight = 
torch.softmax(weight.float(), dim=-1).type(weight.dtype) if mask is not None: - # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs. - mask = mask.repeat(self.n_heads, 1).unsqueeze(1) - weight = weight * mask + mask = mask.repeat(self.n_heads, 1, 1) + weight[mask.logical_not()] = -torch.inf + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) a = torch.einsum("bts,bcs->bct", weight, v) return a.reshape(bs, -1, length) @@ -93,23 +92,24 @@ def __init__( channels, num_heads=1, num_head_channels=-1, - do_checkpoint=True, + *, relative_pos_embeddings=False, + tortoise_norm=False, ): super().__init__() self.channels = channels - self.do_checkpoint = do_checkpoint if num_head_channels == -1: self.num_heads = num_heads else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + assert channels % num_head_channels == 0, ( + f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + ) self.num_heads = channels // num_head_channels self.norm = normalization(channels) self.qkv = nn.Conv1d(channels, channels * 3, 1) # split heads before split qkv self.attention = QKVAttentionLegacy(self.num_heads) + self.tortoise_norm = tortoise_norm self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) if relative_pos_embeddings: @@ -126,10 +126,13 @@ def __init__( def forward(self, x, mask=None): b, c, *spatial = x.shape x = x.reshape(b, c, -1) - qkv = self.qkv(self.norm(x)) + x_norm = self.norm(x) + qkv = self.qkv(x_norm) h = self.attention(qkv, mask, self.relative_pos_embeddings) h = self.proj_out(h) - return (x + h).reshape(b, c, *spatial) + if self.tortoise_norm: + return (x + h).reshape(b, c, *spatial) + return (x_norm + h).reshape(b, c, *spatial) class Upsample(nn.Module): @@ -185,114 +188,6 @@ def forward(self, x): return self.op(x) -class ResBlock(nn.Module): - def __init__( - self, - channels, - dropout, - out_channels=None, - use_conv=False, - use_scale_shift_norm=False, - up=False, - down=False, - kernel_size=3, - ): - super().__init__() - self.channels = channels - self.dropout = dropout - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_scale_shift_norm = use_scale_shift_norm - padding = 1 if kernel_size == 3 else 2 - - self.in_layers = nn.Sequential( - normalization(channels), - nn.SiLU(), - nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), - ) - - self.updown = up or down - - if up: - self.h_upd = Upsample(channels, False) - self.x_upd = Upsample(channels, False) - elif down: - self.h_upd = Downsample(channels, False) - self.x_upd = Downsample(channels, False) - else: - self.h_upd = self.x_upd = nn.Identity() - - self.out_layers = nn.Sequential( - normalization(self.out_channels), - nn.SiLU(), - nn.Dropout(p=dropout), - zero_module(nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)), - ) - - if self.out_channels == channels: - self.skip_connection = nn.Identity() - elif use_conv: - self.skip_connection = nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding) - else: - self.skip_connection = nn.Conv1d(channels, self.out_channels, 1) - - def forward(self, x): - if self.updown: - in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] - h = in_rest(x) - h = self.h_upd(h) - x = self.x_upd(x) - h = in_conv(h) - else: - h = self.in_layers(x) - h = self.out_layers(h) - return self.skip_connection(x) + h - - -class 
AudioMiniEncoder(nn.Module): - def __init__( - self, - spec_dim, - embedding_dim, - base_channels=128, - depth=2, - resnet_blocks=2, - attn_blocks=4, - num_attn_heads=4, - dropout=0, - downsample_factor=2, - kernel_size=3, - ): - super().__init__() - self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1)) - ch = base_channels - res = [] - for l in range(depth): - for r in range(resnet_blocks): - res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) - res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor)) - ch *= 2 - self.res = nn.Sequential(*res) - self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)) - attn = [] - for a in range(attn_blocks): - attn.append( - AttentionBlock( - embedding_dim, - num_attn_heads, - ) - ) - self.attn = nn.Sequential(*attn) - self.dim = embedding_dim - - def forward(self, x): - h = self.init(x) - h = self.res(h) - h = self.final(h) - h = self.attn(h) - return h[:, :, 0] - - DEFAULT_MEL_NORM_FILE = "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/mel_norms.pth" @@ -397,7 +292,7 @@ def forward(self, x, **kwargs): return h -class TypicalLogitsWarper(LogitsWarper): +class TypicalLogitsWarper(LogitsProcessor): def __init__( self, mass: float = 0.9, diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index 4f299a8fd9..6bbe6c389c 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -1,7 +1,6 @@ import logging import os from glob import glob -from typing import Dict, List import librosa import numpy as np @@ -9,7 +8,7 @@ import torchaudio from scipy.io.wavfile import read -from TTS.utils.audio.torch_transforms import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT, amp_to_db from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -88,27 +87,9 @@ def normalize_tacotron_mel(mel): return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 -def dynamic_range_compression(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def get_voices(extra_voice_dirs: List[str] = []): +def get_voices(extra_voice_dirs: list[str] = []): dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -118,7 +99,7 @@ def get_voices(extra_voice_dirs: List[str] = []): return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): +def load_voice(voice: str, extra_voice_dirs: list[str] = []): if voice == "random": return None, None @@ -134,7 +115,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): return conds, None -def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): +def load_voices(voices: list[str], extra_voice_dirs: list[str] = []): latents = [] clips = [] for voice in voices: @@ -144,14 +125,14 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): return None, None clip, latent = load_voice(voice, extra_voice_dirs) if latent is None: - assert ( - len(latents) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." 
+ assert len(latents) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) clips.extend(clip) elif clip is None: - assert ( - len(clips) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + assert len(clips) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) latents.append(latent) if len(latents) == 0: return clips, None @@ -175,7 +156,7 @@ def wav_to_univnet_mel(wav, do_normalization=False, device="cuda"): ) stft = stft.to(device) mel = stft(wav) - mel = dynamic_range_compression(mel) + mel = amp_to_db(mel) if do_normalization: mel = normalize_tacotron_mel(mel) return mel diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index aaae695516..eaeb2a03c1 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,6 +1,6 @@ # AGPL: a notification must be added stating that changes have been made to that file. import functools -from typing import Optional +import random import torch import torch.nn as nn @@ -123,7 +123,7 @@ def forward( else: emb = self.embeddings(input_ids) emb = emb + self.text_pos_embedding.get_fixed_embedding( - attention_mask.shape[1] - mel_len, attention_mask.device + attention_mask.shape[1] - (mel_len + 1), attention_mask.device ) transformer_outputs = self.transformer( @@ -175,44 +175,56 @@ def __init__( embedding_dim, attn_blocks=6, num_attn_heads=4, - do_checkpointing=False, - mean=False, + *, + tortoise_norm=False, ): super().__init__() attn = [] self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads)) + attn.append(AttentionBlock(embedding_dim, num_attn_heads, tortoise_norm=tortoise_norm)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim - self.do_checkpointing = do_checkpointing - self.mean = mean def forward(self, x): + """ + x: (b, 80, s) + """ h = self.init(x) h = self.attn(h) - if self.mean: - return h.mean(dim=2) - else: - return h[:, :, 0] + return h class LearnedPositionEmbeddings(nn.Module): - def __init__(self, seq_len, model_dim, init=0.02): + def __init__(self, seq_len, model_dim, init=0.02, relative=False): super().__init__() self.emb = nn.Embedding(seq_len, model_dim) # Initializing this way is standard for GPT-2 self.emb.weight.data.normal_(mean=0.0, std=init) + self.relative = relative + self.seq_len = seq_len def forward(self, x): sl = x.shape[1] - return self.emb(torch.arange(0, sl, device=x.device)) + if self.relative: + start = random.randint(sl, self.seq_len) - sl + return self.emb(torch.arange(start, start + sl, device=x.device)) + else: + return self.emb(torch.arange(0, sl, device=x.device)) def get_fixed_embedding(self, ind, dev): - return self.emb(torch.arange(0, ind, device=dev))[ind - 1 : ind] - - -def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing): + return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) + + +def build_hf_gpt_transformer( + layers: int, + model_dim: int, + heads: int, + max_mel_seq_len: int, + max_text_seq_len: int, + checkpointing: bool, + max_prompt_len: int = 0, +): """ GPT-2 implemented by the HuggingFace library. """ @@ -220,8 +232,8 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text gpt_config = GPT2Config( vocab_size=256, # Unused. 
- n_positions=max_mel_seq_len + max_text_seq_len, - n_ctx=max_mel_seq_len + max_text_seq_len, + n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len, + n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len, n_embd=model_dim, n_layer=layers, n_head=heads, @@ -234,13 +246,18 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) # Built-in token embeddings are unused. del gpt.wte - return ( - gpt, - LearnedPositionEmbeddings(max_mel_seq_len, model_dim), - LearnedPositionEmbeddings(max_text_seq_len, model_dim), - None, - None, + + mel_pos_emb = ( + LearnedPositionEmbeddings(max_mel_seq_len, model_dim) + if max_mel_seq_len != -1 + else functools.partial(null_position_embeddings, dim=model_dim) ) + text_pos_emb = ( + LearnedPositionEmbeddings(max_text_seq_len, model_dim) + if max_mel_seq_len != -1 + else functools.partial(null_position_embeddings, dim=model_dim) + ) + return gpt, mel_pos_emb, text_pos_emb, None, None class MelEncoder(nn.Module): @@ -334,12 +351,12 @@ def __init__( self.mel_layer_pos_embedding, self.text_layer_pos_embedding, ) = build_hf_gpt_transformer( - layers, - model_dim, - heads, - self.max_mel_tokens + 2 + self.max_conditioning_inputs, - self.max_text_tokens + 2, - checkpointing, + layers=layers, + model_dim=model_dim, + heads=heads, + max_mel_seq_len=self.max_mel_tokens + 2 + self.max_conditioning_inputs, + max_text_seq_len=self.max_text_tokens + 2, + checkpointing=checkpointing, ) if train_solo_embeddings: self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True) @@ -455,7 +472,7 @@ def get_conditioning(self, speech_conditioning_input): ) conds = [] for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])[:, :, 0]) conds = torch.stack(conds, dim=1) conds = conds.mean(dim=1) return conds @@ -591,9 +608,9 @@ def inference_speech( if input_tokens is None: inputs = fake_inputs else: - assert ( - num_return_sequences % input_tokens.shape[0] == 0 - ), "The number of return sequences must be divisible by the number of input sequences" + assert num_return_sequences % input_tokens.shape[0] == 0, ( + "The number of return sequences must be divisible by the number of input sequences" + ) fake_inputs = fake_inputs.repeat(num_return_sequences, 1) input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) inputs = torch.cat([fake_inputs, input_tokens], dim=1) @@ -622,8 +639,8 @@ def inference_speech( def _prepare_attention_mask_for_generation( inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + pad_token_id: torch.Tensor | None, + eos_token_id: torch.Tensor | None, ) -> torch.LongTensor: # No information for attention mask inference -> return default attention mask default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) diff --git a/TTS/tts/layers/tortoise/classifier.py b/TTS/tts/layers/tortoise/classifier.py index 8764bb070b..337323db67 100644 --- a/TTS/tts/layers/tortoise/classifier.py +++ b/TTS/tts/layers/tortoise/classifier.py @@ -16,7 +16,6 @@ def __init__( up=False, down=False, kernel_size=3, - do_checkpoint=True, ): super().__init__() self.channels = channels @@ -24,7 +23,6 @@ def __init__( self.out_channels = out_channels or channels self.use_conv = use_conv 
self.use_scale_shift_norm = use_scale_shift_norm - self.do_checkpoint = do_checkpoint padding = 1 if kernel_size == 3 else 2 self.in_layers = nn.Sequential( @@ -92,14 +90,14 @@ def __init__( self.layers = depth for l in range(depth): for r in range(resnet_blocks): - res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)) + res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor)) ch *= 2 self.res = nn.Sequential(*res) self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)) attn = [] for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) + attn.append(AttentionBlock(embedding_dim, num_attn_heads, tortoise_norm=True)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim diff --git a/TTS/tts/layers/tortoise/clvp.py b/TTS/tts/layers/tortoise/clvp.py index 241dfdd4f4..44da1324e7 100644 --- a/TTS/tts/layers/tortoise/clvp.py +++ b/TTS/tts/layers/tortoise/clvp.py @@ -8,10 +8,6 @@ from TTS.tts.layers.tortoise.xtransformers import Encoder -def exists(val): - return val is not None - - def masked_mean(t, mask, dim=1): t = t.masked_fill(~mask[:, :, None], 0.0) return t.sum(dim=1) / mask.sum(dim=1)[..., None] diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 2b29091b44..cfb8fa800d 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -653,7 +653,7 @@ def p_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: @@ -805,7 +805,7 @@ def ddim_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: diff --git a/TTS/tts/layers/tortoise/diffusion_decoder.py b/TTS/tts/layers/tortoise/diffusion_decoder.py index f71eaf1718..cfdeaff8bb 100644 --- a/TTS/tts/layers/tortoise/diffusion_decoder.py +++ b/TTS/tts/layers/tortoise/diffusion_decoder.py @@ -130,7 +130,7 @@ def __init__(self, model_channels, dropout, num_heads): dims=1, use_scale_shift_norm=True, ) - self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True) + self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True) def forward(self, x, time_emb): y = self.resblk(x, time_emb) @@ -177,17 +177,17 @@ def __init__( # transformer network. 
self.code_embedding = nn.Embedding(in_tokens, model_channels) self.code_converter = nn.Sequential( - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), ) self.code_norm = normalization(model_channels) self.latent_conditioner = nn.Sequential( nn.Conv1d(in_latent_channels, model_channels, 3, padding=1), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), ) self.contextual_embedder = nn.Sequential( nn.Conv1d(in_channels, model_channels, 3, padding=1, stride=2), @@ -196,31 +196,31 @@ def __init__( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), ) self.unconditioned_embedding = nn.Parameter(torch.randn(1, model_channels, 1)) diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index 6a1d8ff784..c8892d456a 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -98,9 +98,7 @@ def __init__( if schedule not in ["discrete", "linear", "cosine"]: raise ValueError( - "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( - schedule - ) + f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'" ) self.schedule = schedule @@ -150,7 +148,7 @@ def marginal_log_mean_coeff(self, t): t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device), - ).reshape((-1)) + ).reshape(-1) elif self.schedule == "linear": return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 elif self.schedule == "cosine": @@ -447,7 +445,7 @@ def correcting_xt_fn(xt, t, step): Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. 
""" - self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) + self.model = lambda x, t: model_fn(x, t.expand(x.shape[0])) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type @@ -527,7 +525,7 @@ def get_time_steps(self, skip_type, t_T, t_0, N, device): return t else: raise ValueError( - "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type) + f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'" ) def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): @@ -565,41 +563,21 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type if order == 3: K = steps // 3 + 1 if steps % 3 == 0: - orders = [ - 3, - ] * ( - K - 2 - ) + [2, 1] + orders = [3] * (K - 2) + [2, 1] elif steps % 3 == 1: - orders = [ - 3, - ] * ( - K - 1 - ) + [1] + orders = [3] * (K - 1) + [1] else: - orders = [ - 3, - ] * ( - K - 1 - ) + [2] + orders = [3] * (K - 1) + [2] elif order == 2: if steps % 2 == 0: K = steps // 2 - orders = [ - 2, - ] * K + orders = [2] * K else: K = steps // 2 + 1 - orders = [ - 2, - ] * ( - K - 1 - ) + [1] + orders = [2] * (K - 1) + [1] elif order == 1: K = 1 - orders = [ - 1, - ] * steps + orders = [1] * steps else: raise ValueError("'order' must be '1' or '2' or '3'.") if skip_type == "logSNR": @@ -607,15 +585,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) else: timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ - torch.cumsum( - torch.tensor( - [ - 0, - ] - + orders - ), - 0, - ).to(device) + torch.cumsum(torch.tensor([0] + orders), 0).to(device) ] return timesteps_outer, orders @@ -693,7 +663,7 @@ def singlestep_dpm_solver_second_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 0.5 ns = self.noise_schedule @@ -790,7 +760,7 @@ def singlestep_dpm_solver_third_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 1.0 / 3.0 if r2 is None: @@ -913,7 +883,7 @@ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, x_t: A pytorch tensor. The approximated solution at time `t`. 
""" if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") ns = self.noise_schedule model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1] t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1] @@ -1062,7 +1032,7 @@ def singlestep_dpm_solver_update( r2=r2, ) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"): """ @@ -1086,7 +1056,7 @@ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, elif order == 3: return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def dpm_solver_adaptive( self, @@ -1150,8 +1120,8 @@ def higher_update(x, s, t, **kwargs): return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs) else: - raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) - while torch.abs((s - t_0)).mean() > t_err: + raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}") + while torch.abs(s - t_0).mean() > t_err: t = ns.inverse_lambda(lambda_s + h) x_lower, lower_noise_kwargs = lower_update(x, s, t) x_higher = higher_update(x, s, t, **lower_noise_kwargs) @@ -1219,9 +1189,9 @@ def inverse( """ t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start t_T = self.noise_schedule.T if t_end is None else t_end - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) return self.sample( x, steps=steps, @@ -1364,9 +1334,9 @@ def sample( """ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end t_T = self.noise_schedule.T if t_start is None else t_start - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. 
For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) if return_intermediate: assert method in [ "multistep", @@ -1487,7 +1457,7 @@ def sample( if return_intermediate: intermediates.append(x) else: - raise ValueError("Got wrong method {}".format(method)) + raise ValueError(f"Got wrong method {method}") if denoise_to_zero: t = torch.ones((1,)).to(device) * t_0 x = self.denoise_to_zero_fn(x, t) diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index 6cb1bab96a..531f294220 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -1,22 +1,19 @@ +from typing import TypeVar + import torch import torch.nn.functional as F from einops import rearrange from torch import nn -# helpers - - -def exists(val): - return val is not None +from TTS.utils.generic_utils import exists - -def default(val, d): - return val if exists(val) else d +# helpers +_T = TypeVar("_T") -def cast_tuple(val, depth=1): +def cast_tuple(val: tuple[_T] | list[_T] | _T, depth: int = 1) -> tuple[_T]: if isinstance(val, list): - val = tuple(val) + return tuple(val) return val if isinstance(val, tuple) else (val,) * depth @@ -46,9 +43,9 @@ def route_args(router, args, depth): class SequentialSequence(nn.Module): def __init__(self, layers, args_route={}, layer_dropout=0.0): super().__init__() - assert all( - len(route) == len(layers) for route in args_route.values() - ), "each argument route map must have the same depth as the number of sequential layers" + assert all(len(route) == len(layers) for route in args_route.values()), ( + "each argument route map must have the same depth as the number of sequential layers" + ) self.layers = layers self.args_route = args_route self.layer_dropout = layer_dropout diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index a5200c2673..e7497d8190 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -1,6 +1,6 @@ +from collections.abc import Callable from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional import torch import torch.nn as nn @@ -293,7 +293,7 @@ def __init__( hop_length=256, n_mel_channels=100, ): - super(UnivNetGenerator, self).__init__() + super().__init__() self.mel_channel = n_mel_channels self.noise_dim = noise_dim self.hop_length = hop_length @@ -344,7 +344,7 @@ def forward(self, c, z): return z def eval(self, inference=False): - super(UnivNetGenerator, self).eval() + super().eval() # don't remove weight norm while validation in training loop if inference: self.remove_weight_norm() @@ -378,7 +378,7 @@ def inference(self, c, z=None): class VocType: constructor: Callable[[], nn.Module] model_path: str - subkey: Optional[str] = None + subkey: str | None = None def optionally_index(self, model_dict): if self.subkey is not None: diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 9325b8c720..b2e74cf118 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -1,13 +1,15 @@ import math from collections import namedtuple from functools import partial -from inspect import isfunction import torch import torch.nn.functional as F from einops import rearrange, repeat from torch import einsum, nn +from TTS.tts.layers.tortoise.transformer import cast_tuple, max_neg_value +from TTS.utils.generic_utils import default, exists + DEFAULT_DIM_HEAD = 64 Intermediates = 
namedtuple("Intermediates", ["pre_softmax_attn", "post_softmax_attn"]) @@ -25,20 +27,6 @@ # helpers -def exists(val): - return val is not None - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -def cast_tuple(val, depth): - return val if isinstance(val, tuple) else (val,) * depth - - class always: def __init__(self, val): self.val = val @@ -63,10 +51,6 @@ def __call__(self, x, *args, **kwargs): return x == self.val -def max_neg_value(tensor): - return -torch.finfo(tensor.dtype).max - - def l2norm(t): return F.normalize(t, p=2, dim=-1) @@ -576,9 +560,9 @@ def __init__( self.rel_pos_bias = rel_pos_bias if rel_pos_bias: - assert ( - rel_pos_num_buckets <= rel_pos_max_distance - ), "number of relative position buckets must be less than the relative position max distance" + assert rel_pos_num_buckets <= rel_pos_max_distance, ( + "number of relative position buckets must be less than the relative position max distance" + ) self.rel_pos = RelativePositionBias( scale=dim_head**0.5, causal=causal, @@ -696,9 +680,9 @@ def forward( del input_mask if exists(attn_mask): - assert ( - 2 <= attn_mask.ndim <= 4 - ), "attention mask must have greater than 2 dimensions but less than or equal to 4" + assert 2 <= attn_mask.ndim <= 4, ( + "attention mask must have greater than 2 dimensions but less than or equal to 4" + ) if attn_mask.ndim == 2: attn_mask = rearrange(attn_mask, "i j -> () () i j") elif attn_mask.ndim == 3: @@ -806,9 +790,9 @@ def __init__( rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None - assert not ( - alibi_pos_bias and rel_pos_bias - ), "you can only choose Alibi positional bias or T5 relative positional bias, not both" + assert not (alibi_pos_bias and rel_pos_bias), ( + "you can only choose Alibi positional bias or T5 relative positional bias, not both" + ) if alibi_pos_bias: alibi_num_heads = default(alibi_num_heads, heads) @@ -938,9 +922,9 @@ def forward( past_key_values=None, expected_seq_len=None, ): - assert not ( - self.cross_attend ^ (exists(context) or exists(full_context)) - ), "context must be passed in if cross_attend is set to True" + assert not (self.cross_attend ^ (exists(context) or exists(full_context))), ( + "context must be passed in if cross_attend is set to True" + ) assert context is None or full_context is None, "only one of full_context or context can be provided" hiddens = [] @@ -956,9 +940,9 @@ def forward( rotary_pos_emb = None if exists(self.rotary_pos_emb): if not self.training and self.causal: - assert ( - expected_seq_len is not None - ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + assert expected_seq_len is not None, ( + "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + ) elif expected_seq_len is None: expected_seq_len = 0 seq_len = x.shape[1] diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py index 3449739fdc..49f7a0d074 100644 --- a/TTS/tts/layers/vits/discriminator.py +++ b/TTS/tts/layers/vits/discriminator.py @@ -2,7 +2,7 @@ from torch import nn from torch.nn.modules.conv import Conv1d -from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP +from TTS.vocoder.models.hifigan_discriminator import LRELU_SLOPE, DiscriminatorP class DiscriminatorS(torch.nn.Module): @@ -39,7 +39,7 @@ def forward(self, x): feat = [] for l in self.convs: x = l(x) - x = 
torch.nn.functional.leaky_relu(x, 0.1) + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) feat.append(x) x = self.conv_post(x) feat.append(x) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index 50ed1024de..ab2ca5667a 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -256,7 +256,7 @@ def __init__( ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, x, x_lengths, g=None): + def forward(self, x, x_lengths, g=None, tau=1.0): """ Shapes: - x: :math:`[B, C, T]` @@ -268,5 +268,5 @@ def forward(self, x, x_lengths, g=None): x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask mean, log_scale = torch.split(stats, self.out_channels, dim=1) - z = (mean + torch.randn_like(mean) * torch.exp(log_scale)) * x_mask + z = (mean + torch.randn_like(mean) * tau * torch.exp(log_scale)) * x_mask return z, mean, log_scale, x_mask diff --git a/TTS/tts/layers/vits/transforms.py b/TTS/tts/layers/vits/transforms.py index 3cac1b8d6d..da5deea9ef 100644 --- a/TTS/tts/layers/vits/transforms.py +++ b/TTS/tts/layers/vits/transforms.py @@ -74,7 +74,7 @@ def unconstrained_rational_quadratic_spline( outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError("{} tails are not implemented.".format(tails)) + raise RuntimeError(f"{tails} tails are not implemented.") outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( inputs=inputs[inside_interval_mask], diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py index 73970fb0bf..4f806f82cb 100644 --- a/TTS/tts/layers/xtts/dvae.py +++ b/TTS/tts/layers/xtts/dvae.py @@ -14,10 +14,6 @@ logger = logging.getLogger(__name__) -def default(val, d): - return val if val is not None else d - - def eval_decorator(fn): def inner(model, *args, **kwargs): was_training = model.training diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index b3c3b31b47..4e0f53616d 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -1,6 +1,5 @@ # ported from: https://github.com/neonbjb/tortoise-tts -import functools import random import torch @@ -8,83 +7,16 @@ import torch.nn.functional as F from transformers import GPT2Config -from TTS.tts.layers.tortoise.autoregressive import _prepare_attention_mask_for_generation +from TTS.tts.layers.tortoise.autoregressive import ( + ConditioningEncoder, + LearnedPositionEmbeddings, + _prepare_attention_mask_for_generation, + build_hf_gpt_transformer, +) from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel -from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler -def null_position_embeddings(range, dim): - return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device) - - -class LearnedPositionEmbeddings(nn.Module): - def __init__(self, seq_len, model_dim, init=0.02, relative=False): - super().__init__() - # nn.Embedding - self.emb = torch.nn.Embedding(seq_len, model_dim) - # Initializing this way is standard for GPT-2 - self.emb.weight.data.normal_(mean=0.0, std=init) - self.relative = relative - self.seq_len = seq_len - - def forward(self, x): - sl = x.shape[1] - if self.relative: - start = random.randint(sl, self.seq_len) - sl - return self.emb(torch.arange(start, start + sl, device=x.device)) - else: - return self.emb(torch.arange(0, sl, device=x.device)) - - def get_fixed_embedding(self, ind, dev): - 
return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) - - -def build_hf_gpt_transformer( - layers, - model_dim, - heads, - max_mel_seq_len, - max_text_seq_len, - max_prompt_len, - checkpointing, -): - """ - GPT-2 implemented by the HuggingFace library. - """ - from transformers import GPT2Config, GPT2Model - - gpt_config = GPT2Config( - vocab_size=256, # Unused. - n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len, - n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len, - n_embd=model_dim, - n_layer=layers, - n_head=heads, - gradient_checkpointing=checkpointing, - use_cache=not checkpointing, - ) - gpt = GPT2Model(gpt_config) - # Override the built in positional embeddings - del gpt.wpe - gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) - # Built-in token embeddings are unused. - del gpt.wte - - mel_pos_emb = ( - LearnedPositionEmbeddings(max_mel_seq_len, model_dim) - if max_mel_seq_len != -1 - else functools.partial(null_position_embeddings, dim=model_dim) - ) - text_pos_emb = ( - LearnedPositionEmbeddings(max_text_seq_len, model_dim) - if max_mel_seq_len != -1 - else functools.partial(null_position_embeddings, dim=model_dim) - ) - # gpt = torch.compile(gpt, mode="reduce-overhead", fullgraph=True) - return gpt, mel_pos_emb, text_pos_emb, None, None - - class GPT(nn.Module): def __init__( self, @@ -149,13 +81,13 @@ def __init__( self.mel_layer_pos_embedding, self.text_layer_pos_embedding, ) = build_hf_gpt_transformer( - layers, - model_dim, - heads, - self.max_mel_tokens, - self.max_text_tokens, - self.max_prompt_tokens, - checkpointing, + layers=layers, + model_dim=model_dim, + heads=heads, + max_mel_seq_len=self.max_mel_tokens, + max_text_seq_len=self.max_text_tokens, + max_prompt_len=self.max_prompt_tokens, + checkpointing=checkpointing, ) if train_solo_embeddings: self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True) @@ -303,19 +235,6 @@ def get_logits( else: return first_logits - def get_conditioning(self, speech_conditioning_input): - speech_conditioning_input = ( - speech_conditioning_input.unsqueeze(1) - if len(speech_conditioning_input.shape) == 3 - else speech_conditioning_input - ) - conds = [] - for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) - conds = torch.stack(conds, dim=1) - conds = conds.mean(dim=1) - return conds - def get_prompts(self, prompt_codes): """ Create a prompt from the mel codes. This is used to condition the model on the mel codes. 
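Note on the xtts/gpt.py hunks above: the XTTS GPT drops its local copies of ConditioningEncoder, LearnedPositionEmbeddings, build_hf_gpt_transformer, and get_conditioning and reuses the Tortoise implementations, calling the shared factory with keyword arguments. A minimal sketch of that call is below; the dimensions are hypothetical placeholders, not values from any released XTTS configuration.

# Sketch only, assuming the shared factory shown earlier in this patch.
from TTS.tts.layers.tortoise.autoregressive import build_hf_gpt_transformer

gpt, mel_pos_emb, text_pos_emb, _, _ = build_hf_gpt_transformer(
    layers=8,              # hypothetical transformer depth
    model_dim=512,         # hypothetical embedding width
    heads=8,               # hypothetical attention heads
    max_mel_seq_len=604,   # hypothetical; passing -1 swaps in null (all-zero) position embeddings
    max_text_seq_len=402,  # hypothetical
    checkpointing=False,
    max_prompt_len=70,     # hypothetical; widens n_positions/n_ctx to fit the conditioning prompt
)

The factory returns the GPT-2 body plus learned mel/text position embeddings, so callers no longer build their own position-embedding modules.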
@@ -354,6 +273,7 @@ def get_style_emb(self, cond_input, return_latent=False): """ cond_input: (b, 80, s) or (b, 1, 80, s) conds: (b, 1024, s) + output: (b, 1024, 32) """ conds = None if not return_latent: @@ -427,12 +347,12 @@ def forward( audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1])) # 💖 Lovely assertions - assert ( - max_mel_len <= audio_codes.shape[-1] - ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" - assert ( - max_text_len <= text_inputs.shape[-1] - ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + assert max_mel_len <= audio_codes.shape[-1], ( + f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" + ) + assert max_text_len <= text_inputs.shape[-1], ( + f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + ) # Append stop token to text inputs text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token) @@ -534,9 +454,9 @@ def forward( mel_targets[idx, l + 1 :] = -1 # check if stoptoken is in every row of mel_targets - assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[ - 0 - ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[0], ( + f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + ) # ignore the loss for the segment used for conditioning # coin flip for the segment to be ignored diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 5ef0030b8b..550ad3e3b2 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -1,618 +1,13 @@ import logging import torch -import torchaudio -from torch import nn -from torch.nn import Conv1d, ConvTranspose1d -from torch.nn import functional as F -from torch.nn.utils.parametrizations import weight_norm -from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec -from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -from TTS.vocoder.models.hifigan_generator import get_padding +from TTS.encoder.models.resnet import ResNetSpeakerEncoder +from TTS.vocoder.models.hifigan_generator import HifiganGenerator logger = logging.getLogger(__name__) -LRELU_SLOPE = 0.1 - - -class ResBlock1(torch.nn.Module): - """Residual Block Type 1. It has 3 convolutional layers in each convolutional block. - - Network:: - - x -> lrelu -> conv1_1 -> conv1_2 -> conv1_3 -> z -> lrelu -> conv2_1 -> conv2_2 -> conv2_3 -> o -> + -> o - |--------------------------------------------------------------------------------------------------| - - - Args: - channels (int): number of hidden channels for the convolutional layers. - kernel_size (int): size of the convolution filter in each layer. - dilations (list): list of dilation value for each conv layer in a block. 
- """ - - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super().__init__() - self.convs1 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), - ] - ) - - self.convs2 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - ] - ) - - def forward(self, x): - """ - Args: - x (Tensor): input tensor. - Returns: - Tensor: output tensor. - Shapes: - x: [B, C, T] - """ - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - xt = c2(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_parametrizations(l, "weight") - for l in self.convs2: - remove_parametrizations(l, "weight") - - -class ResBlock2(torch.nn.Module): - """Residual Block Type 2. It has 1 convolutional layers in each convolutional block. - - Network:: - - x -> lrelu -> conv1-> -> z -> lrelu -> conv2-> o -> + -> o - |---------------------------------------------------| - - - Args: - channels (int): number of hidden channels for the convolutional layers. - kernel_size (int): size of the convolution filter in each layer. - dilations (list): list of dilation value for each conv layer in a block. - """ - - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super().__init__() - self.convs = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - ] - ) - - def forward(self, x): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_parametrizations(l, "weight") - - -class HifiganGenerator(torch.nn.Module): - def __init__( - self, - in_channels, - out_channels, - resblock_type, - resblock_dilation_sizes, - resblock_kernel_sizes, - upsample_kernel_sizes, - upsample_initial_channel, - upsample_factors, - inference_padding=5, - cond_channels=0, - conv_pre_weight_norm=True, - conv_post_weight_norm=True, - conv_post_bias=True, - cond_in_each_up_layer=False, - ): - r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) - - Network: - x -> lrelu -> upsampling_layer -> resblock1_k1x1 -> z1 -> + -> z_sum / #resblocks -> lrelu -> conv_post_7x1 -> tanh -> o - .. -> zI ---| - resblockN_kNx1 -> zN ---' - - Args: - in_channels (int): number of input tensor channels. - out_channels (int): number of output tensor channels. - resblock_type (str): type of the `ResBlock`. '1' or '2'. 
- resblock_dilation_sizes (List[List[int]]): list of dilation values in each layer of a `ResBlock`. - resblock_kernel_sizes (List[int]): list of kernel sizes for each `ResBlock`. - upsample_kernel_sizes (List[int]): list of kernel sizes for each transposed convolution. - upsample_initial_channel (int): number of channels for the first upsampling layer. This is divided by 2 - for each consecutive upsampling layer. - upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. - inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. - """ - super().__init__() - self.inference_padding = inference_padding - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_factors) - self.cond_in_each_up_layer = cond_in_each_up_layer - - # initial upsampling layers - self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) - resblock = ResBlock1 if resblock_type == "1" else ResBlock2 - # upsampling layers - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_factors, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - # MRF blocks - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(resblock(ch, k, d)) - # post convolution layer - self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias)) - if cond_channels > 0: - self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1) - - if not conv_pre_weight_norm: - remove_parametrizations(self.conv_pre, "weight") - - if not conv_post_weight_norm: - remove_parametrizations(self.conv_post, "weight") - - if self.cond_in_each_up_layer: - self.conds = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - self.conds.append(nn.Conv1d(cond_channels, ch, 1)) - - def forward(self, x, g=None): - """ - Args: - x (Tensor): feature input tensor. - g (Tensor): global conditioning input tensor. - - Returns: - Tensor: output waveform. - - Shapes: - x: [B, C, T] - Tensor: [B, 1, T] - """ - o = self.conv_pre(x) - if hasattr(self, "cond_layer"): - o = o + self.cond_layer(g) - for i in range(self.num_upsamples): - o = F.leaky_relu(o, LRELU_SLOPE) - o = self.ups[i](o) - - if self.cond_in_each_up_layer: - o = o + self.conds[i](g) - - z_sum = None - for j in range(self.num_kernels): - if z_sum is None: - z_sum = self.resblocks[i * self.num_kernels + j](o) - else: - z_sum += self.resblocks[i * self.num_kernels + j](o) - o = z_sum / self.num_kernels - o = F.leaky_relu(o) - o = self.conv_post(o) - o = torch.tanh(o) - return o - - @torch.no_grad() - def inference(self, c): - """ - Args: - x (Tensor): conditioning input tensor. - - Returns: - Tensor: output waveform. 
- - Shapes: - x: [B, C, T] - Tensor: [B, 1, T] - """ - c = c.to(self.conv_pre.weight.device) - c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") - return self.forward(c) - - def remove_weight_norm(self): - logger.info("Removing weight norm...") - for l in self.ups: - remove_parametrizations(l, "weight") - for l in self.resblocks: - l.remove_weight_norm() - remove_parametrizations(self.conv_pre, "weight") - remove_parametrizations(self.conv_post, "weight") - - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4()) - self.load_state_dict(state["model"]) - if eval: - self.eval() - assert not self.training - self.remove_weight_norm() - - -class SELayer(nn.Module): - def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Sequential( - nn.Linear(channel, channel // reduction), - nn.ReLU(inplace=True), - nn.Linear(channel // reduction, channel), - nn.Sigmoid(), - ) - - def forward(self, x): - b, c, _, _ = x.size() - y = self.avg_pool(x).view(b, c) - y = self.fc(y).view(b, c, 1, 1) - return x * y - - -class SEBasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.se = SELayer(planes, reduction) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.relu(out) - out = self.bn1(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.se(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - return out - - -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - logger.warning("Layer missing in the model definition: %s", k) - # 1. filter out unnecessary keys - pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} - # 2. filter out different size layers - pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} - # 3. skip reinit layers - if c.has("reinit_layers") and c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} - # 4. 
overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) - return model_dict - - -class PreEmphasis(nn.Module): - def __init__(self, coefficient=0.97): - super().__init__() - self.coefficient = coefficient - self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) - - def forward(self, x): - assert len(x.size()) == 2 - - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") - return torch.nn.functional.conv1d(x, self.filter).squeeze(1) - - -class ResNetSpeakerEncoder(nn.Module): - """This is copied from 🐸TTS to remove it from the dependencies.""" - - # pylint: disable=W0102 - def __init__( - self, - input_dim=64, - proj_dim=512, - layers=[3, 4, 6, 3], - num_filters=[32, 64, 128, 256], - encoder_type="ASP", - log_input=False, - use_torch_spec=False, - audio_config=None, - ): - super(ResNetSpeakerEncoder, self).__init__() - - self.encoder_type = encoder_type - self.input_dim = input_dim - self.log_input = log_input - self.use_torch_spec = use_torch_spec - self.audio_config = audio_config - self.proj_dim = proj_dim - - self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) - self.relu = nn.ReLU(inplace=True) - self.bn1 = nn.BatchNorm2d(num_filters[0]) - - self.inplanes = num_filters[0] - self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) - self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) - self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) - self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) - - self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) - - else: - self.torch_spec = None - - outmap_size = int(self.input_dim / 8) - - self.attention = nn.Sequential( - nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), - nn.ReLU(), - nn.BatchNorm1d(128), - nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), - nn.Softmax(dim=2), - ) - - if self.encoder_type == "SAP": - out_dim = num_filters[3] * outmap_size - elif self.encoder_type == "ASP": - out_dim = num_filters[3] * outmap_size * 2 - else: - raise ValueError("Undefined encoder") - - self.fc = nn.Linear(out_dim, proj_dim) - - self._init_layers() - - def _init_layers(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def create_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return 
nn.Sequential(*layers) - - # pylint: disable=R0201 - def new_parameter(self, *size): - out = nn.Parameter(torch.FloatTensor(*size)) - nn.init.xavier_normal_(out) - return out - - def forward(self, x, l2_norm=False): - """Forward pass of the model. - - Args: - x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` - to compute the spectrogram on-the-fly. - l2_norm (bool): Whether to L2-normalize the outputs. - - Shapes: - - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` - """ - x.squeeze_(1) - # if you torch spec compute it otherwise use the mel spec computed by the AP - if self.use_torch_spec: - x = self.torch_spec(x) - - if self.log_input: - x = (x + 1e-6).log() - x = self.instancenorm(x).unsqueeze(1) - - x = self.conv1(x) - x = self.relu(x) - x = self.bn1(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = x.reshape(x.size()[0], -1, x.size()[-1]) - - w = self.attention(x) - - if self.encoder_type == "SAP": - x = torch.sum(x * w, dim=2) - elif self.encoder_type == "ASP": - mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5)) - x = torch.cat((mu, sg), 1) - - x = x.view(x.size()[0], -1) - x = self.fc(x) - - if l2_norm: - x = torch.nn.functional.normalize(x, p=2, dim=1) - return x - - def load_checkpoint( - self, - checkpoint_path: str, - eval: bool = False, - use_cuda: bool = False, - criterion=None, - cache=False, - ): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) - try: - self.load_state_dict(state["model"]) - logger.info("Model fully restored.") - except (KeyError, RuntimeError) as error: - # If eval raise the error - if eval: - raise error - - logger.info("Partial model initialization.") - model_dict = self.state_dict() - model_dict = set_init_dict(model_dict, state["model"]) - self.load_state_dict(model_dict) - del model_dict - - # load the criterion for restore_path - if criterion is not None and "criterion" in state: - try: - criterion.load_state_dict(state["criterion"]) - except (KeyError, RuntimeError) as error: - logger.exception("Criterion load ignored because of: %s", error) - - if use_cuda: - self.cuda() - if criterion is not None: - criterion = criterion.cuda() - - if eval: - self.eval() - assert not self.training - - if not eval: - return criterion, state["step"] - return criterion - class HifiDecoder(torch.nn.Module): def __init__( @@ -702,7 +97,7 @@ def forward(self, latents, g=None): o = self.waveform_decoder(z, g=g) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g): """ Args: diff --git a/TTS/tts/layers/xtts/latent_encoder.py b/TTS/tts/layers/xtts/latent_encoder.py deleted file mode 100644 index f9d62a36f1..0000000000 --- a/TTS/tts/layers/xtts/latent_encoder.py +++ /dev/null @@ -1,141 +0,0 @@ -# ported from: Originally ported from: https://github.com/neonbjb/tortoise-tts - -import math - -import torch -from torch import nn -from torch.nn import functional as F - - -class GroupNorm32(nn.GroupNorm): - def forward(self, x): - return super().forward(x.float()).type(x.dtype) - - -def conv_nd(dims, *args, **kwargs): - if dims == 1: - return nn.Conv1d(*args, **kwargs) - elif dims == 2: - return nn.Conv2d(*args, **kwargs) - elif dims == 3: - return nn.Conv3d(*args, **kwargs) - raise ValueError(f"unsupported dimensions: {dims}") - - -def normalization(channels): - groups = 32 - if channels <= 16: - groups = 8 - elif channels <= 64: - groups = 16 - while 
channels % groups != 0: - groups = int(groups / 2) - assert groups > 2 - return GroupNorm32(groups, channels) - - -def zero_module(module): - for p in module.parameters(): - p.detach().zero_() - return module - - -class QKVAttention(nn.Module): - def __init__(self, n_heads): - super().__init__() - self.n_heads = n_heads - - def forward(self, qkv, mask=None, qk_bias=0): - """ - Apply QKV attention. - - :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention. - """ - bs, width, length = qkv.shape - assert width % (3 * self.n_heads) == 0 - ch = width // (3 * self.n_heads) - q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) - scale = 1 / math.sqrt(math.sqrt(ch)) - weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards - weight = weight + qk_bias - if mask is not None: - mask = mask.repeat(self.n_heads, 1, 1) - weight[mask.logical_not()] = -torch.inf - weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) - a = torch.einsum("bts,bcs->bct", weight, v) - - return a.reshape(bs, -1, length) - - -class AttentionBlock(nn.Module): - """An attention block that allows spatial positions to attend to each other.""" - - def __init__( - self, - channels, - num_heads=1, - num_head_channels=-1, - out_channels=None, - do_activation=False, - ): - super().__init__() - self.channels = channels - out_channels = channels if out_channels is None else out_channels - self.do_activation = do_activation - if num_head_channels == -1: - self.num_heads = num_heads - else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" - self.num_heads = channels // num_head_channels - self.norm = normalization(channels) - self.qkv = conv_nd(1, channels, out_channels * 3, 1) - self.attention = QKVAttention(self.num_heads) - - self.x_proj = nn.Identity() if out_channels == channels else conv_nd(1, channels, out_channels, 1) - self.proj_out = zero_module(conv_nd(1, out_channels, out_channels, 1)) - - def forward(self, x, mask=None, qk_bias=0): - b, c, *spatial = x.shape - if mask is not None: - if len(mask.shape) == 2: - mask = mask.unsqueeze(0).repeat(x.shape[0], 1, 1) - if mask.shape[1] != x.shape[-1]: - mask = mask[:, : x.shape[-1], : x.shape[-1]] - - x = x.reshape(b, c, -1) - x = self.norm(x) - if self.do_activation: - x = F.silu(x, inplace=True) - qkv = self.qkv(x) - h = self.attention(qkv, mask=mask, qk_bias=qk_bias) - h = self.proj_out(h) - xp = self.x_proj(x) - return (xp + h).reshape(b, xp.shape[1], *spatial) - - -class ConditioningEncoder(nn.Module): - def __init__( - self, - spec_dim, - embedding_dim, - attn_blocks=6, - num_attn_heads=4, - ): - super().__init__() - attn = [] - self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) - for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads)) - self.attn = nn.Sequential(*attn) - self.dim = embedding_dim - - def forward(self, x): - """ - x: (b, 80, s) - """ - h = self.init(x) - h = self.attn(h) - return h diff --git a/TTS/tts/layers/xtts/perceiver_encoder.py b/TTS/tts/layers/xtts/perceiver_encoder.py index f4b6e84123..7477087283 100644 --- a/TTS/tts/layers/xtts/perceiver_encoder.py +++ b/TTS/tts/layers/xtts/perceiver_encoder.py @@ -9,9 +9,8 @@ from einops.layers.torch import Rearrange from torch import einsum, nn - -def exists(val): - return val is not None +from TTS.tts.layers.tortoise.transformer import 
GEGLU +from TTS.utils.generic_utils import default, exists def once(fn): @@ -151,12 +150,6 @@ def Sequential(*mods): return nn.Sequential(*filter(exists, mods)) -def default(val, d): - if exists(val): - return val - return d() if callable(d) else d - - class RMSNorm(nn.Module): def __init__(self, dim, scale=True, dim_cond=None): super().__init__() @@ -194,12 +187,6 @@ def forward(self, x): return super().forward(causal_padded_x) -class GEGLU(nn.Module): - def forward(self, x): - x, gate = x.chunk(2, dim=-1) - return F.gelu(gate) * x - - def FeedForward(dim, mult=4, causal_conv=False): dim_inner = int(dim * mult * 2 / 3) diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 44cf940c69..9343f656e1 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -4,7 +4,7 @@ import inspect import random import warnings -from typing import Callable, Optional, Union +from collections.abc import Callable import numpy as np import torch @@ -45,18 +45,18 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): - @torch.no_grad() + @torch.inference_mode() def generate( # noqa: PLR0911 self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[StreamGenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, - synced_gpus: Optional[bool] = False, + inputs: torch.Tensor | None = None, + generation_config: StreamGenerationConfig | None = None, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]] | None = None, + synced_gpus: bool | None = False, seed: int = 0, **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: + ) -> GenerateOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head. 
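# Editor's note (illustrative sketch, not part of the changeset): the hunks above and
# below replace @torch.no_grad() with @torch.inference_mode() on inference-only entry
# points (HifiDecoder.inference, NewGenerationMixin.generate, sample_stream, ...).
# inference_mode() disables autograd like no_grad(), but additionally skips view
# tracking and version counting, so it is slightly faster; tensors created inside it
# can never be reused in a graph that requires gradients. Minimal example, assuming a
# plain nn.Module:
import torch
from torch import nn


class TinyDecoder(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(4, 2)

    @torch.inference_mode()
    def inference(self, x: torch.Tensor) -> torch.Tensor:
        # No autograd graph is recorded here; the output is an inference tensor.
        return self.proj(x)


if __name__ == "__main__":
    out = TinyDecoder().eval().inference(torch.randn(1, 4))
    assert not out.requires_grad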
@@ -207,8 +207,8 @@ def generate( # noqa: PLR0911 ) model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( inputs_tensor, - generation_config._pad_token_tensor, - generation_config._eos_token_tensor, + generation_config, + model_kwargs, ) # decoder-only models should use left-padding for generation @@ -662,23 +662,23 @@ def typeerror(): **model_kwargs, ) - @torch.no_grad() + @torch.inference_mode() def sample_stream( self, input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, list[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_warper: LogitsProcessorList | None = None, + max_length: int | None = None, + pad_token_id: int | None = None, + eos_token_id: int | list[int] | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + output_scores: bool | None = None, + return_dict_in_generate: bool | None = None, + synced_gpus: bool | None = False, **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: + ) -> SampleOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -953,7 +953,6 @@ def init_stream_support(): def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList: - warpers = LogitsProcessorList() if generation_config.temperature is not None and generation_config.temperature != 1.0: diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index cf80d8cff3..ee7989407e 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -16,6 +16,7 @@ from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words +from TTS.tts.utils.text.cleaners import collapse_whitespace, lowercase logger = logging.getLogger(__name__) @@ -73,12 +74,10 @@ def split_sentence(text, lang, text_split_length=250): return text_splits -_whitespace_re = re.compile(r"\s+") - # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = { "en": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), @@ -101,7 +100,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "es": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "señora"), ("sr", "señor"), @@ -114,7 +113,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "fr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mme", "madame"), ("mr", "monsieur"), @@ -126,7 +125,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "de": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("fr", "frau"), ("dr", "doktor"), @@ -136,7 +135,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pt": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "senhora"), ("sr", "senhor"), @@ -149,7 +148,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "it": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # ("sig.ra", "signora"), ("sig", "signore"), @@ -161,7 +160,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("p", "pani"), ("m", "pan"), @@ -171,19 +170,19 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ar": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # There are not many common abbreviations in Arabic as in English. ] ], "zh": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "cs": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("ing", "inženýr"), # engineer @@ -192,7 +191,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ru": [ - (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\b", re.IGNORECASE), x[1]) for x in [ ("г-жа", "госпожа"), # Mrs. ("г-н", "господин"), # Mr. @@ -201,7 +200,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "nl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dhr", "de heer"), # Mr. ("mevr", "mevrouw"), # Mrs. @@ -211,7 +210,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "tr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("b", "bay"), # Mr. ("byk", "büyük"), # büyük @@ -220,7 +219,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "hu": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("b", "bácsi"), # Mr. @@ -229,13 +228,13 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ko": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "hi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts. 
] @@ -261,7 +260,7 @@ def expand_abbreviations_multilingual(text, lang="en"): _symbols_multilingual = { "en": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " and "), ("@", " at "), @@ -273,7 +272,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "es": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " y "), ("@", " arroba "), @@ -285,7 +284,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "fr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " et "), ("@", " arobase "), @@ -297,7 +296,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "de": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " und "), ("@", " at "), @@ -309,7 +308,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pt": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " arroba "), @@ -321,7 +320,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "it": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " chiocciola "), @@ -333,7 +332,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pl": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " i "), ("@", " małpa "), @@ -346,7 +345,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ar": [ # Arabic - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " و "), ("@", " على "), @@ -359,7 +358,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "zh": [ # Chinese - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 和 "), ("@", " 在 "), @@ -372,7 +371,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "cs": [ # Czech - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " a "), ("@", " na "), @@ -385,7 +384,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ru": [ # Russian - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " и "), ("@", " собака "), @@ -398,7 +397,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "nl": [ # Dutch - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " en "), ("@", " bij "), @@ -410,7 +409,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "tr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ve "), ("@", " at "), @@ -422,7 +421,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hu": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ 
("&", " és "), ("@", " kukac "), @@ -435,7 +434,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ko": [ # Korean - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 그리고 "), ("@", " 에 "), @@ -447,7 +446,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hi": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " और "), ("@", " ऐट दी रेट "), @@ -528,12 +527,12 @@ def _remove_dots(m): def _expand_decimal_point(m, lang="en"): amount = m.group(1).replace(",", ".") - return num2words(float(amount), lang=lang if lang != "cs" else "cz") + return num2words(float(amount), lang=lang) def _expand_currency(m, lang="en", currency="USD"): - amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) - full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz") + amount = float(re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))) + full_amount = num2words(amount, to="currency", currency=currency, lang=lang) and_equivalents = { "en": ", ", @@ -564,11 +563,11 @@ def _expand_currency(m, lang="en", currency="USD"): def _expand_ordinal(m, lang="en"): - return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") + return num2words(int(m.group(1)), ordinal=True, lang=lang) def _expand_number(m, lang="en"): - return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") + return num2words(int(m.group(0)), lang=lang) def expand_numbers_multilingual(text, lang="en"): @@ -592,14 +591,6 @@ def expand_numbers_multilingual(text, lang="en"): return text -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, " ", text) - - def multilingual_cleaners(text, lang): text = text.replace('"', "") if lang == "tr": @@ -614,13 +605,6 @@ def multilingual_cleaners(text, lang): return text -def basic_cleaners(text): - """Basic pipeline that lowercases and collapses whitespace without transliteration.""" - text = lowercase(text) - text = collapse_whitespace(text) - return text - - def chinese_transliterate(text): try: import pypinyin diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 0253d65ddd..edd8fc4b65 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch import torch.nn as nn @@ -18,7 +17,7 @@ from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer from TTS.tts.layers.xtts.trainer.dataset import XTTSDataset from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig +from TTS.tts.models.xtts import Xtts, XttsArgs from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -31,12 +30,7 @@ class GPTTrainerConfig(XttsConfig): optimizer_wd_only_on_weights: bool = False weighted_loss_attrs: dict = field(default_factory=lambda: {}) weighted_loss_multipliers: dict = field(default_factory=lambda: {}) - test_sentences: List[dict] = field(default_factory=lambda: []) - - -@dataclass -class XttsAudioConfig(XttsAudioConfig): - dvae_sample_rate: int = 22050 + test_sentences: list[dict] = field(default_factory=lambda: []) @dataclass @@ -202,10 +196,6 @@ def __init__(self, 
config: Coqpit): mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate ) - @property - def device(self): - return next(self.parameters()).device - def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens): """ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode @@ -230,8 +220,8 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels ) return losses - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: # init gpt for inference mode @@ -246,7 +236,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 s_info["language"], gpt_cond_len=3, )["wav"] - test_audios["{}-audio".format(idx)] = wav + test_audios[f"{idx}-audio"] = wav # delete inference layers del self.xtts.gpt.gpt_inference @@ -254,11 +244,15 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 return {"audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: return batch @torch.no_grad() # torch no grad to avoid gradients from the pre-processing and DVAE codes extraction @@ -340,7 +334,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 WeightsFileHandler.add_pre_callback(callback_clearml_load_save) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -360,12 +354,12 @@ def get_sampler(self, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": # pylint: disable=W0613 if is_eval and not config.run_eval: loader = None @@ -405,7 +399,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the optimizer based on the config parameters.""" # ToDo: deal with multi GPU training if self.config.optimizer_wd_only_on_weights: @@ -436,7 +430,7 @@ def get_optimizer(self) -> List: v.is_norm = isinstance(m, norm_modules) v.is_emb = isinstance(m, emb_modules) - fpn = "%s.%s" % (mn, k) if mn else k # full param name + fpn = f"{mn}.{k}" if mn else k # full param name all_param_names.add(fpn) param_map[fpn] = v if v.is_bias or v.is_norm or v.is_emb: @@ -469,7 +463,7 @@ def get_optimizer(self) -> List: parameters=self.xtts.gpt.parameters(), ) - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the scheduler for the optimizer. 
Args: @@ -500,7 +494,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GPTTrainerConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index 69b8dae952..360d9b06c8 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -392,7 +392,7 @@ # ================================================================================ # # basic class # ================================================================================ # -class ChineseChar(object): +class ChineseChar: """ 中文字符 每个字符对应简体和繁体, @@ -420,13 +420,13 @@ class ChineseNumberUnit(ChineseChar): """ def __init__(self, power, simplified, traditional, big_s, big_t): - super(ChineseNumberUnit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.power = power self.big_s = big_s self.big_t = big_t def __str__(self): - return "10^{}".format(self.power) + return f"10^{self.power}" @classmethod def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): @@ -447,7 +447,7 @@ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=Fals power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] ) else: - raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type)) + raise ValueError(f"Counting type should be in {NUMBERING_TYPES} ({numbering_type} provided).") class ChineseNumberDigit(ChineseChar): @@ -456,7 +456,7 @@ class ChineseNumberDigit(ChineseChar): """ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): - super(ChineseNumberDigit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.value = value self.big_s = big_s self.big_t = big_t @@ -477,7 +477,7 @@ class ChineseMath(ChineseChar): """ def __init__(self, simplified, traditional, symbol, expression=None): - super(ChineseMath, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.symbol = symbol self.expression = expression self.big_s = simplified @@ -487,13 +487,13 @@ def __init__(self, simplified, traditional, symbol, expression=None): CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath -class NumberSystem(object): +class NumberSystem: """ 中文数字系统 """ -class MathSymbol(object): +class MathSymbol: """ 用于中文数字系统的数学符号 (繁/简体), e.g. 
positive = ['正', '正'] @@ -507,8 +507,7 @@ def __init__(self, positive, negative, point): self.point = point def __iter__(self): - for v in self.__dict__.values(): - yield v + yield from self.__dict__.values() # class OtherSymbol(object): @@ -640,7 +639,7 @@ def compute_value(integer_symbols): int_str = str(compute_value(int_part)) dec_str = "".join([str(d.value) for d in dec_part]) if dec_part: - return "{0}.{1}".format(int_str, dec_str) + return f"{int_str}.{dec_str}" else: return int_str @@ -686,7 +685,7 @@ def get_value(value_string, use_zeros=True): int_string = int_dec[0] dec_string = int_dec[1] else: - raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) + raise ValueError(f"invalid input num string with more than one dot: {number_string}") if use_units and len(int_string) > 1: result_symbols = get_value(int_string) @@ -702,7 +701,7 @@ def get_value(value_string, use_zeros=True): if isinstance(v, CND) and v.value == 2: next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None previous_symbol = result_symbols[i - 1] if i > 0 else None - if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, CNU | type(None)): if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): result_symbols[i] = liang @@ -1166,7 +1165,7 @@ def __call__(self, text): ) ndone = 0 - with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: + with open(args.ifile, encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: if args.format == "tsv": reader = csv.DictReader(istream, delimiter="\t") assert "TEXT" in reader.fieldnames diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index ebfa171c80..4746b13ea2 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,12 +1,11 @@ import logging -from typing import Dict, List, Union from TTS.utils.generic_utils import find_module logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": +def setup_model(config: "Coqpit", samples: list[list] | list[dict] = None) -> "BaseTTS": logger.info("Using model: %s", config.model) # fetch the right model implementation. 
if "base_model" in config and config["base_model"] is not None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 1c3d57582e..c2e29c7100 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -13,7 +12,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import generate_path, sequence_mask +from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -169,35 +168,6 @@ def compute_align_path(self, mu, log_sigma, y, x_mask, y_mask): dr_mas = torch.sum(attn, -1) return dr_mas.squeeze(1), log_p - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def expand_encoder_outputs(self, en, dr, x_mask, y_mask): - """Generate attention alignment map from durations and - expand encoder outputs - - Examples:: - - encoder output: [a,b,c,d] - - durations: [1, 3, 2, 1] - - - expanded: [a, b, b, b, c, c, d] - - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] - """ - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) - return o_en_ex, attn - def format_durations(self, o_dr_log, x_mask): o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale o_dr[o_dr < 1] = 1.0 @@ -243,9 +213,8 @@ def _forward_encoder(self, x, x_lengths, g=None): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations - o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) + o_en_ex, attn, y_mask = expand_encoder_outputs(o_en, dr, x_mask, y_lengths) # positional encoding if hasattr(self, "pos_encoder"): o_en_ex = self.pos_encoder(o_en_ex, y_mask) @@ -263,9 +232,7 @@ def _forward_mdn(self, o_en, y, y_lengths, x_mask): dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward( - self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None - ): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: - x: :math:`[B, T_max]` @@ -282,7 +249,7 @@ def forward( o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) - attn = self.generate_attn(dr_mas, x_mask, y_mask) + attn = generate_attention(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = 
self._forward_encoder(x, x_lengths, g) @@ -318,7 +285,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: @@ -382,9 +349,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -397,9 +362,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -433,7 +396,7 @@ def on_epoch_start(self, trainer): self.phase = self._set_phase(trainer.config, trainer.total_steps_done) @staticmethod - def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "AlignTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index ced8f60ed8..84814745a2 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -1,6 +1,6 @@ import os from dataclasses import dataclass -from typing import Optional +from pathlib import Path import numpy as np from coqpit import Coqpit @@ -42,10 +42,6 @@ def __init__( self.encodec = EncodecModel.encodec_model_24khz() self.encodec.set_target_bandwidth(6.0) - @property - def device(self): - return next(self.parameters()).device - def load_bark_models(self): self.semantic_model, self.config = load_model( ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text" @@ -68,7 +64,7 @@ def train_step( def text_to_semantic( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, allow_early_stop=True, @@ -98,7 +94,7 @@ def text_to_semantic( def semantic_to_waveform( self, semantic_tokens: np.ndarray, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, ): @@ -132,7 +128,7 @@ def semantic_to_waveform( def generate_audio( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, text_temp: float = 0.7, waveform_temp: float = 0.7, base=None, @@ -194,9 +190,7 @@ def _set_voice_dirs(self, voice_dirs): return _voice_dirs # TODO: remove config from synthesize - def synthesize( - self, text, config, speaker_id="random", voice_dirs=None, **kwargs - ): # pylint: disable=unused-argument + def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs): # pylint: disable=unused-argument """Synthesize speech with the given input text. 
Args: @@ -206,12 +200,14 @@ def synthesize( speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in `voice_dirs` with the name `speaker_id`. Defaults to None. voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None. - **kwargs: Model specific inference settings used by `generate_audio()` and `TTS.tts.layers.bark.inference_funcs.generate_text_semantic(). + **kwargs: Model specific inference settings used by `generate_audio()` and + `TTS.tts.layers.bark.inference_funcs.generate_text_semantic()`. Returns: - A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference, - `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents` - as latents used at inference. + A dictionary of the output values with `wav` as output waveform, + `deterministic_seed` as seed used at inference, `text_input` as text token IDs + after tokenizer, `voice_samples` as samples used for cloning, + `conditioning_latents` as latents used at inference. """ speaker_id = "random" if speaker_id is None else speaker_id @@ -267,10 +263,12 @@ def load_checkpoint( fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt") hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth") + # The paths in the default config start with /root/.local/share/tts and need to be fixed self.config.LOCAL_MODEL_PATHS["text"] = text_model_path self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path + self.config.CACHE_DIR = str(Path(text_model_path).parent) self.load_bark_models() diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 79cdf1a7d4..05f4ae168d 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,7 +1,6 @@ import copy import logging from abc import abstractmethod -from typing import Dict, Tuple import torch from coqpit import Coqpit @@ -62,7 +61,7 @@ def __init__( self.coarse_decoder = None @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: + def _format_aux_input(aux_input: dict) -> dict: """Set missing fields to their default values""" if aux_input: return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) @@ -94,9 +93,7 @@ def forward(self): def inference(self): pass - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load model checkpoint and set up internals. Args: @@ -141,7 +138,7 @@ def init_from_config(config: Coqpit): # TEST AND LOG FUNCTIONS # ########################## - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
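# Editor's note (illustrative sketch, not part of the changeset): the Bark
# load_checkpoint() hunk above now derives config.CACHE_DIR from the directory that
# actually holds the downloaded checkpoints instead of relying on the hard-coded
# /root/.local/share/tts default mentioned in the added comment. The path below is
# hypothetical and only shows the Path-based derivation used in the diff.
from pathlib import Path

text_model_path = "/home/user/.local/share/tts/suno/bark/text_2.pt"
cache_dir = str(Path(text_model_path).parent)
assert cache_dir == "/home/user/.local/share/tts/suno/bark"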
@@ -169,17 +166,19 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index ccb023ce84..95cbf5bbf5 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,7 +1,6 @@ import logging import os import random -from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -79,16 +78,18 @@ def _set_model_args(self, config: Coqpit): else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: List = None): - """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining - `in_channels` size of the connected layers. + def init_multispeaker(self, config: Coqpit, data: list = None): + """Set up for multi-speaker TTS. + + Initialize a speaker embedding layer if needed and define expected embedding + channel size for defining `in_channels` size of the connected layers. This implementation yields 3 possible outcomes: - 1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing. + 1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing. 2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512. 3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of - `config.d_vector_dim` or 512. + `config.d_vector_dim` or 512. You can override this function for new models. @@ -112,7 +113,7 @@ def init_multispeaker(self, config: Coqpit, data: List = None): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - def get_aux_input(self, **kwargs) -> Dict: + def get_aux_input(self, **kwargs) -> dict: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} @@ -163,7 +164,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_id": language_id, } - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Generic batch formatting for `TTSDataset`. You must override this if you use a custom dataset. @@ -209,9 +210,9 @@ def format_batch(self, batch: Dict) -> Dict: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -283,12 +284,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -364,7 +365,7 @@ def get_data_loader( def _get_test_aux_input( self, - ) -> Dict: + ) -> dict: d_vector = None if self.config.use_d_vector_file: d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] @@ -381,7 +382,7 @@ def _get_test_aux_input( } return aux_inputs - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -412,13 +413,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer): diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index c6f15a7952..2d59db74c0 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -3,35 +3,40 @@ from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch import torch.distributed as dist -import torchaudio from coqpit import Coqpit -from librosa.filters import mel as librosa_mel_fn from torch import nn -from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler from trainer.io import load_fsspec from trainer.torch import DistributedSampler, DistributedSamplerWrapper from trainer.trainer_utils import get_optimizer, get_scheduler -from TTS.tts.datasets.dataset import F0Dataset, TTSDataset, _parse_sample +from TTS.tts.datasets.dataset import F0Dataset, TTSDataset, _parse_sample, get_attribute_balancer_weights from TTS.tts.layers.delightful_tts.acoustic_model import AcousticModel -from TTS.tts.layers.losses import ForwardSumLoss, VitsDiscriminatorLoss +from TTS.tts.layers.losses import ( + ForwardSumLoss, + VitsDiscriminatorLoss, + _binary_alignment_loss, + feature_loss, + generator_loss, +) from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.models.base_tts import BaseTTSE2E +from TTS.tts.models.vits import load_audio from TTS.tts.utils.helpers import average_over_durations, compute_attn_prior, rand_segments, segment, sequence_mask from 
TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.synthesis import embedding_to_torch, id_to_torch, numpy_to_torch from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_pitch, plot_spectrogram from TTS.utils.audio.numpy_transforms import build_mel_basis, compute_f0 from TTS.utils.audio.numpy_transforms import db_to_amp as db_to_amp_numpy from TTS.utils.audio.numpy_transforms import mel_to_wav as mel_to_wav_numpy from TTS.utils.audio.processor import AudioProcessor +from TTS.utils.audio.torch_transforms import wav_to_mel, wav_to_spec from TTS.vocoder.layers.losses import MultiScaleSTFTLoss from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -39,291 +44,27 @@ logger = logging.getLogger(__name__) -def id_to_torch(aux_id, cuda=False): - if aux_id is not None: - aux_id = np.asarray(aux_id) - aux_id = torch.from_numpy(aux_id) - if cuda: - return aux_id.cuda() - return aux_id - - -def embedding_to_torch(d_vector, cuda=False): - if d_vector is not None: - d_vector = np.asarray(d_vector) - d_vector = torch.from_numpy(d_vector).float() - d_vector = d_vector.squeeze().unsqueeze(0) - if cuda: - return d_vector.cuda() - return d_vector - - -def numpy_to_torch(np_array, dtype, cuda=False): - if np_array is None: - return None - tensor = torch.as_tensor(np_array, dtype=dtype) - if cuda: - return tensor.cuda() - return tensor - - -def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1) - mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) - return mask - - -def pad(input_ele: List[torch.Tensor], max_len: int) -> torch.Tensor: - out_list = torch.jit.annotate(List[torch.Tensor], []) - for batch in input_ele: - if len(batch.shape) == 1: - one_batch_padded = F.pad(batch, (0, max_len - batch.size(0)), "constant", 0.0) - else: - one_batch_padded = F.pad(batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0) - out_list.append(one_batch_padded) - out_padded = torch.stack(out_list) - return out_padded - - -def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: - return torch.ceil(lens / stride).int() - - -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: - assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." 
- return torch.randn(shape) * np.sqrt(2 / shape[1]) - - -# pylint: disable=redefined-outer-name -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - hann_window = {} mel_basis = {} -@torch.no_grad() -def weights_reset(m: nn.Module): - # check if the current module has reset_parameters and if it is reset the weight - reset_parameters = getattr(m, "reset_parameters", None) - if callable(reset_parameters): - m.reset_parameters() - - -def get_module_weights_sum(mdl: nn.Module): - dict_sums = {} - for name, w in mdl.named_parameters(): - if "weight" in name: - value = w.data.sum().item() - dict_sums[name] = value - return dict_sums - - -def load_audio(file_path: str): - """Load the audio file normalized in [-1, 1] - - Return Shapes: - - x: :math:`[1, T]` - """ - x, sr = torchaudio.load( - file_path, - ) - assert (x > 1).sum() + (x < -1).sum() == 0 - return x, sr - - -def _amp_to_db(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def _db_to_amp(x, C=1): - return torch.exp(x) / C - - -def amp_to_db(magnitudes): - output = _amp_to_db(magnitudes) - return output - - -def db_to_amp(magnitudes): - output = _db_to_amp(magnitudes) - return output - - -def _wav_to_spec(y, n_fft, hop_length, win_length, center=False): - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global hann_window # pylint: disable=global-statement - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - return spec - - -def wav_to_spec(y, n_fft, hop_length, win_length, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - spec = _wav_to_spec(y, n_fft, hop_length, win_length, center=center) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - def wav_to_energy(y, n_fft, hop_length, win_length, center=False): - spec = _wav_to_spec(y, n_fft, hop_length, win_length, center=center) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + spec = wav_to_spec(y, n_fft, hop_length, win_length, center=center) return torch.norm(spec, dim=1, keepdim=True) -def name_mel_basis(spec, n_fft, fmax): - n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}" - return n_fft_len - - -def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): - """ - Args Shapes: - - spec : :math:`[B,C,T]` - - Return Shapes: - - mel : :math:`[B,C,T]` - """ - global mel_basis # pylint: disable=global-statement - mel_basis_key = name_mel_basis(spec, n_fft, fmax) - # pylint: disable=too-many-function-args - if mel_basis_key not in mel_basis: - # pylint: disable=missing-kwoa - mel = librosa_mel_fn(sample_rate, n_fft, num_mels, fmin, fmax) - mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - mel = 
torch.matmul(mel_basis[mel_basis_key], spec) - mel = amp_to_db(mel) - return mel - - -def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T_y]` - - Return Shapes: - - spec : :math:`[B,C,T_spec]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global mel_basis, hann_window # pylint: disable=global-statement - mel_basis_key = name_mel_basis(y, n_fft, fmax) - wnsize_dtype_device = str(win_length) + "_" + str(y.dtype) + "_" + str(y.device) - if mel_basis_key not in mel_basis: - # pylint: disable=missing-kwoa - mel = librosa_mel_fn( - sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) # pylint: disable=too-many-function-args - mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - spec = torch.matmul(mel_basis[mel_basis_key], spec) - spec = amp_to_db(spec) - return spec - - ############################## # DATASET ############################## -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): - """Create balancer weight for torch WeightedSampler""" - attr_names_samples = np.array([item[attr_name] for item in items]) - unique_attr_names = np.unique(attr_names_samples).tolist() - attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] - attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) - weight_attr = 1.0 / attr_count - dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) - dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) - if multi_dict is not None: - multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) - dataset_samples_weight *= multiplier_samples - return ( - torch.from_numpy(dataset_samples_weight).float(), - unique_attr_names, - np.unique(dataset_samples_weight).tolist(), - ) - - class ForwardTTSE2eF0Dataset(F0Dataset): """Override F0Dataset to avoid slow computing of pitches""" def __init__( self, ap, - samples: Union[List[List], List[Dict]], + samples: list[list] | list[dict], cache_path: str = None, precompute_num_workers=0, normalize_f0=True, @@ -533,15 +274,15 @@ def collate_fn(self, batch): @dataclass class VocoderConfig(Coqpit): resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 
5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) use_spectral_norm_discriminator: bool = False - upsampling_rates_discriminator: List[int] = field(default_factory=lambda: [4, 4, 4, 4]) - periods_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) - pretrained_model_path: Optional[str] = None + upsampling_rates_discriminator: list[int] = field(default_factory=lambda: [4, 4, 4, 4]) + periods_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + pretrained_model_path: str | None = None @dataclass @@ -696,10 +437,6 @@ def __init__( periods=self.config.vocoder.periods_discriminator, ) - @property - def device(self): - return next(self.parameters()).device - @property def energy_scaler(self): return self.acoustic_model.energy_scaler @@ -815,7 +552,7 @@ def forward( attn_priors: torch.FloatTensor = None, d_vectors: torch.FloatTensor = None, speaker_idx: torch.LongTensor = None, - ) -> Dict: + ) -> dict: """Model's forward pass. Args: @@ -880,7 +617,7 @@ def forward( model_outputs["slice_ids"] = slice_ids return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"d_vectors": None, "speaker_ids": None}, pitch_transform=None, energy_transform=None ): @@ -904,7 +641,7 @@ def inference( model_outputs["model_outputs"] = vocoder_output return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference_spec_decoder(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): encoder_outputs = self.acoustic_model.inference( tokens=x, @@ -1094,9 +831,7 @@ def _log(self, batch, outputs, name_prefix="train"): audios[f"{name_prefix}/vocoder_audio"] = sample_voice return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use, unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use, unused-argument """Create visualizations and waveform examples. 
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1196,7 +931,7 @@ def synthesize( **kwargs, ): # pylint: disable=unused-argument # TODO: add cloning support with ref_waveform - is_cuda = next(self.parameters()).is_cuda + device = next(self.parameters()).device # convert text to sequence of token IDs text_inputs = np.asarray( @@ -1210,14 +945,14 @@ def synthesize( if isinstance(speaker_id, str) and self.args.use_speaker_embedding: # get the speaker id for the speaker embedding layer _speaker_id = self.speaker_manager.name_to_id[speaker_id] - _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda) + _speaker_id = id_to_torch(_speaker_id, device=device) if speaker_id is not None and self.args.use_d_vector_file: # get the average d_vector for the speaker d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False) - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + d_vector = embedding_to_torch(d_vector, device=device) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, device=device) text_inputs = text_inputs.unsqueeze(0) # synthesize voice @@ -1240,7 +975,7 @@ def synthesize( return return_dict def synthesize_with_gl(self, text: str, speaker_id, d_vector): - is_cuda = next(self.parameters()).is_cuda + device = next(self.parameters()).device # convert text to sequence of token IDs text_inputs = np.asarray( @@ -1249,12 +984,12 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): ) # pass tensors to backend if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=is_cuda) + speaker_id = id_to_torch(speaker_id, device=device) if d_vector is not None: - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + d_vector = embedding_to_torch(d_vector, device=device) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, device=device) text_inputs = text_inputs.unsqueeze(0) # synthesize voice @@ -1276,8 +1011,8 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): } return return_dict - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
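# Editor's note (illustrative sketch, not part of the changeset): synthesize() and
# synthesize_with_gl() above now pass device=... instead of cuda=is_cuda to the
# id_to_torch / embedding_to_torch / numpy_to_torch helpers. The stand-in function
# below only shows the general pattern (look up the model's device once, then create
# inputs directly on it, which also covers non-CUDA backends); it is not the
# TTS.tts.utils.synthesis API.
import numpy as np
import torch


def numpy_to_torch_sketch(np_array: np.ndarray, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
    # Create the tensor on the target device in one step instead of .cuda()-ing later.
    return torch.as_tensor(np_array, dtype=dtype, device=device)


model = torch.nn.Linear(8, 8)
device = next(model.parameters()).device  # same lookup used in the diff
token_ids = np.array([1, 2, 3], dtype=np.int64)  # hypothetical token IDs
text_inputs = numpy_to_torch_sketch(token_ids, torch.long, device).unsqueeze(0)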
@@ -1303,18 +1038,22 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: d_vector=aux_inputs["d_vector"], ) # speaker_name = self.speaker_manager.speaker_names[aux_inputs["speaker_id"]] - test_audios["{}-audio".format(idx)] = outputs["wav"].T - test_audios["{}-audio_encoder".format(idx)] = outputs_gl["wav"].T - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_audios[f"{idx}-audio"] = outputs["wav"].T + test_audios[f"{idx}-audio_encoder"] = outputs_gl["wav"].T + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None d_vectors = None @@ -1422,12 +1161,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -1479,7 +1218,7 @@ def get_data_loader( def get_criterion(self): return [VitsDiscriminatorLoss(self.config), DelightfulTTSLoss(self.config)] - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. Returns: @@ -1494,7 +1233,7 @@ def get_optimizer(self) -> List: ) return [optimizer_disc, optimizer_gen] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1502,7 +1241,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -1521,9 +1260,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument self.energy_scaler.eval() @staticmethod - def init_from_config( - config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None - ): # pylint: disable=unused-argument + def init_from_config(config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None): # pylint: disable=unused-argument """Initiate model from config Args: @@ -1601,36 +1338,6 @@ def __init__(self, config): self.gen_loss_alpha = config.gen_loss_alpha self.multi_scale_stft_loss_alpha = config.multi_scale_stft_loss_alpha - @staticmethod - def _binary_alignment_loss(alignment_hard, alignment_soft): - """Binary loss that forces soft alignments to match the hard alignments as - explained in `https://arxiv.org/pdf/2108.10447.pdf`. 
- """ - log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() - return -log_sum / alignment_hard.sum() - - @staticmethod - def feature_loss(feats_real, feats_generated): - loss = 0 - for dr, dg in zip(feats_real, feats_generated): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 - - @staticmethod - def generator_loss(scores_fake): - loss = 0 - gen_losses = [] - for dg in scores_fake: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - def forward( self, mel_output, @@ -1728,7 +1435,7 @@ def forward( ) if self.binary_alignment_loss_alpha > 0 and aligner_hard is not None: - binary_alignment_loss = self._binary_alignment_loss(aligner_hard, aligner_soft) + binary_alignment_loss = _binary_alignment_loss(aligner_hard, aligner_soft) total_loss = total_loss + self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight if binary_loss_weight: loss_dict["loss_binary_alignment"] = ( @@ -1748,8 +1455,8 @@ def forward( # vocoder losses if not skip_disc: - loss_feat = self.feature_loss(feats_real=feats_real, feats_generated=feats_fake) * self.feat_loss_alpha - loss_gen = self.generator_loss(scores_fake=scores_fake)[0] * self.gen_loss_alpha + loss_feat = feature_loss(feats_real=feats_real, feats_generated=feats_fake) * self.feat_loss_alpha + loss_gen = generator_loss(scores_fake=scores_fake)[0] * self.gen_loss_alpha loss_dict["vocoder_loss_feat"] = loss_feat loss_dict["vocoder_loss_gen"] = loss_gen loss_dict["loss"] = loss_dict["loss"] + loss_feat + loss_gen diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index d449e580da..497ac3f63a 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -14,7 +13,7 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import average_over_durations, generate_path, sequence_mask +from TTS.tts.utils.helpers import average_over_durations, expand_encoder_outputs, generate_attention, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram @@ -310,49 +309,6 @@ def init_multispeaker(self, config: Coqpit): self.emb_g = nn.Embedding(self.num_speakers, self.args.hidden_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - """Generate an attention mask from the durations. 
- - Shapes - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - """ - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def expand_encoder_outputs(self, en, dr, x_mask, y_mask): - """Generate attention alignment map from durations and - expand encoder outputs - - Shapes: - - en: :math:`(B, D_{en}, T_{en})` - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - - Examples:: - - encoder output: [a,b,c,d] - durations: [1, 3, 2, 1] - - expanded: [a, b, b, b, c, c, d] - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] - """ - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2).to(en.dtype), en.transpose(1, 2)).transpose(1, 2) - return o_en_ex, attn - def format_durations(self, o_dr_log, x_mask): """Format predicted durations. 1. Convert to linear scale from log scale @@ -376,7 +332,7 @@ def format_durations(self, o_dr_log, x_mask): def _forward_encoder( self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Encoding forward pass. 1. Embed speaker IDs if multi-speaker mode. @@ -424,7 +380,7 @@ def _forward_decoder( x_mask: torch.FloatTensor, y_lengths: torch.IntTensor, g: torch.FloatTensor, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Decoding forward pass. 1. Compute the decoder output mask @@ -443,9 +399,8 @@ def _forward_decoder( Returns: Tuple[torch.FloatTensor, torch.FloatTensor]: Decoder output, attention map from durations. """ - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) # expand o_en with durations - o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) + o_en_ex, attn, y_mask = expand_encoder_outputs(o_en, dr, x_mask, y_lengths) # positional encoding if hasattr(self, "pos_encoder"): o_en_ex = self.pos_encoder(o_en_ex, y_mask) @@ -459,7 +414,7 @@ def _forward_pitch_predictor( x_mask: torch.IntTensor, pitch: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Pitch predictor forward pass. 1. Predict pitch from encoder outputs. @@ -495,7 +450,7 @@ def _forward_energy_predictor( x_mask: torch.IntTensor, energy: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Energy predictor forward pass. 1. Predict energy from encoder outputs. @@ -527,7 +482,7 @@ def _forward_energy_predictor( def _forward_aligner( self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. 
Compute a mask to apply to the attention map. @@ -566,7 +521,7 @@ def _forward_aligner( alignment_soft = alignment_soft.squeeze(1).transpose(1, 2) return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -588,8 +543,8 @@ def forward( dr: torch.IntTensor = None, pitch: torch.FloatTensor = None, energy: torch.FloatTensor = None, - aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument - ) -> Dict: + aux_input: dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument + ) -> dict: """Model's forward pass. Args: @@ -624,7 +579,7 @@ def forward( o_dr_log = self.duration_predictor(o_en, x_mask) o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration) # generate attn mask from predicted durations - o_attn = self.generate_attn(o_dr.squeeze(1), x_mask) + o_attn = generate_attention(o_dr.squeeze(1), x_mask) # aligner o_alignment_dur = None alignment_soft = None @@ -672,7 +627,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """Model's inference pass. @@ -815,9 +770,7 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -830,9 +783,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -849,7 +800,7 @@ def on_train_step_start(self, trainer): self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod - def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "ForwardTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5bf4713140..5d03b53dc6 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,6 +1,5 @@ import logging import math -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -125,9 +124,9 @@ def init_multispeaker(self, config: Coqpit): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) if self.speaker_manager is not None: - assert ( - config.d_vector_dim == self.speaker_manager.embedding_dim - ), " [!] 
d-vector dimension mismatch b/w config and speaker manager." + assert config.d_vector_dim == self.speaker_manager.embedding_dim, ( + " [!] d-vector dimension mismatch b/w config and speaker manager." + ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: logger.info("Init speaker_embedding layer.") @@ -162,7 +161,7 @@ def lock_act_norm_layers(self): if getattr(f, "set_ddi", False): f.set_ddi(False) - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): if aux_input is None: d_vectors = None speaker_ids = None @@ -179,7 +178,7 @@ def _set_speaker_input(self, aux_input: Dict): g = speaker_ids if speaker_ids is not None else d_vectors return g - def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: + def _speaker_embedding(self, aux_input: dict) -> torch.Tensor | None: g = self._set_speaker_input(aux_input) # speaker embedding if g is not None: @@ -193,9 +192,7 @@ def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: g = F.normalize(g).unsqueeze(-1) # [b, h, 1] return g - def forward( - self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def forward(self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Args: x (torch.Tensor): @@ -262,7 +259,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_with_MAS( self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -318,10 +315,8 @@ def inference_with_MAS( } return outputs - @torch.no_grad() - def decoder_inference( - self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def decoder_inference(self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Shapes: - y: :math:`[B, T, C]` @@ -341,10 +336,8 @@ def decoder_inference( outputs["logdet"] = logdet return outputs - @torch.no_grad() - def inference( - self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value x_lengths = aux_input["x_lengths"] g = self._speaker_embedding(aux_input) # embedding pass @@ -457,14 +450,12 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -473,8 +464,8 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, 
self.ap.sample_rate) - @torch.no_grad() - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -503,11 +494,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios def preprocess(self, y, y_lengths, y_max_length, attn=None): @@ -522,9 +513,7 @@ def preprocess(self, y, y_lengths, y_max_length, attn=None): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: @@ -543,7 +532,7 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index de5401aac7..2cbf425884 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -8,6 +7,7 @@ from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger +from TTS.tts.layers.losses import NLLLoss from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils from TTS.tts.layers.overflow.neural_hmm import NeuralHMM from TTS.tts.layers.overflow.plotting_utils import ( @@ -101,7 +101,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -173,10 +173,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -194,7 +194,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -238,7 +238,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "NeuralhmmTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -345,17 +345,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -369,25 +365,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) - - -class NLLLoss(nn.Module): - """Negative log likelihood loss.""" - - def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use - """Compute the loss. - - Args: - logits (Tensor): [B, T, D] - - Returns: - Tensor: [1] - - """ - return_dict = {} - return_dict["loss"] = -log_prob.mean() - return return_dict diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index b72f4877cf..aad2e1f553 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -8,6 +7,7 @@ from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger +from TTS.tts.layers.losses import NLLLoss from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils from TTS.tts.layers.overflow.decoder import Decoder from TTS.tts.layers.overflow.neural_hmm import NeuralHMM @@ -32,32 +32,33 @@ class Overflow(BaseTTS): Paper abstract:: Neural HMMs are a type of neural transducer recently proposed for - sequence-to-sequence modelling in text-to-speech. They combine the best features - of classic statistical speech synthesis and modern neural TTS, requiring less - data and fewer training updates, and are less prone to gibberish output caused - by neural attention failures. In this paper, we combine neural HMM TTS with - normalising flows for describing the highly non-Gaussian distribution of speech - acoustics. 
The result is a powerful, fully probabilistic model of durations and - acoustics that can be trained using exact maximum likelihood. Compared to - dominant flow-based acoustic models, our approach integrates autoregression for - improved modelling of long-range dependences such as utterance-level prosody. - Experiments show that a system based on our proposal gives more accurate - pronunciations and better subjective speech quality than comparable methods, - whilst retaining the original advantages of neural HMMs. Audio examples and code - are available at https://shivammehta25.github.io/OverFlow/. + sequence-to-sequence modelling in text-to-speech. They combine the best features + of classic statistical speech synthesis and modern neural TTS, requiring less + data and fewer training updates, and are less prone to gibberish output caused + by neural attention failures. In this paper, we combine neural HMM TTS with + normalising flows for describing the highly non-Gaussian distribution of speech + acoustics. The result is a powerful, fully probabilistic model of durations and + acoustics that can be trained using exact maximum likelihood. Compared to + dominant flow-based acoustic models, our approach integrates autoregression for + improved modelling of long-range dependences such as utterance-level prosody. + Experiments show that a system based on our proposal gives more accurate + pronunciations and better subjective speech quality than comparable methods, + whilst retaining the original advantages of neural HMMs. Audio examples and code + are available at https://shivammehta25.github.io/OverFlow/. Note: - - Neural HMMs uses flat start initialization i.e it computes the means and std and transition probabilities - of the dataset and uses them to initialize the model. This benefits the model and helps with faster learning - If you change the dataset or want to regenerate the parameters change the `force_generate_statistics` and - `mel_statistics_parameter_path` accordingly. + - Neural HMMs uses flat start initialization i.e it computes the means + and std and transition probabilities of the dataset and uses them to initialize + the model. This benefits the model and helps with faster learning If you change + the dataset or want to regenerate the parameters change the + `force_generate_statistics` and `mel_statistics_parameter_path` accordingly. - To enable multi-GPU training, set the `use_grad_checkpointing=False` in config. - This will significantly increase the memory usage. This is because to compute - the actual data likelihood (not an approximation using MAS/Viterbi) we must use - all the states at the previous time step during the forward pass to decide the - probability distribution at the current step i.e the difference between the forward - algorithm and viterbi approximation. + This will significantly increase the memory usage. This is because to compute + the actual data likelihood (not an approximation using MAS/Viterbi) we must use + all the states at the previous time step during the forward pass to decide the + probability distribution at the current step i.e the difference between the forward + algorithm and viterbi approximation. Check :class:`TTS.tts.configs.overflow.OverFlowConfig` for class arguments. 
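Both neuralhmm_tts.py (above) and overflow.py (whose identical copy is removed further below) previously carried their own inline NLLLoss class; the diff deletes those duplicates and imports a single implementation from TTS.tts.layers.losses instead. Assuming the shared class matches the removed code, it reduces to the following sketch:

import torch
from torch import nn


class NLLLoss(nn.Module):
    """Negative log likelihood loss, mirroring the duplicated class removed from the model files."""

    def forward(self, log_prob: torch.Tensor) -> dict:
        # Mean negative log likelihood over the batch, in the dict format the Trainer expects.
        return {"loss": -log_prob.mean()}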
""" @@ -114,7 +115,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -186,10 +187,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. Args: @@ -207,7 +208,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -253,7 +254,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "OverFlowConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -361,17 +362,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -385,25 +382,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) - - -class NLLLoss(nn.Module): - """Negative log likelihood loss.""" - - def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use - """Compute the loss. 
- - Args: - logits (Tensor): [B, T, D] - - Returns: - Tensor: [1] - - """ - return_dict = {} - return_dict["loss"] = -log_prob.mean() - return return_dict diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5d3efd2021..59173691f7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Tuple, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -218,7 +214,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text_input, aux_input=None): aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) @@ -280,7 +276,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: torch.nn.Module) -> tuple[dict, dict]: """Perform a single training step by fetching the right set of samples from the batch. Args: @@ -332,7 +328,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dic loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -380,9 +376,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_spectrogram(pred_linear_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -396,7 +390,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "TacotronConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 2716a39786..e924d82d42 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -238,7 +234,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text, aux_input=None): """Forward pass for inference with no Teacher-Forcing. @@ -309,7 +305,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module): + def train_step(self, batch: dict, criterion: torch.nn.Module): """A single training step. Forward pass and loss computation. 
Args: @@ -360,7 +356,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -403,9 +399,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) @@ -420,7 +414,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "Tacotron2Config", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 01629b5d2a..a42d577676 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -342,7 +342,6 @@ def __init__(self, config: Coqpit): else self.args.autoregressive_batch_size ) self.enable_redaction = self.args.enable_redaction - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if self.enable_redaction: self.aligner = Wav2VecAlignment() @@ -423,7 +422,9 @@ def get_conditioning_latents( Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic properties. - :param voice_samples: List of arbitrary reference clips, which should be *pairs* of torch tensors containing arbitrary kHz waveform data. + + :param voice_samples: List of arbitrary reference clips, which should be *pairs* + of torch tensors containing arbitrary kHz waveform data. :param latent_averaging_mode: 0/1/2 for following modes: 0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples 1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks @@ -671,7 +672,7 @@ def inference( As cond_free_k increases, the output becomes dominated by the conditioning-free signal. diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 are the "mean" prediction of the diffusion network and will sound bland and smeared. - hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive transformer. + hf_generate_kwargs: (`**kwargs`) The huggingface Transformers generate API is used for the autoregressive transformer. Extra keyword args fed to this function get forwarded directly to that API. 
Documentation here: https://huggingface.co/docs/transformers/internal/generation_utils @@ -683,9 +684,9 @@ def inference( text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device) text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. - assert ( - text_tokens.shape[-1] < 400 - ), "Too much text provided. Break the text up into separate segments and re-try inference." + assert text_tokens.shape[-1] < 400, ( + "Too much text provided. Break the text up into separate segments and re-try inference." + ) if voice_samples is not None: ( diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 432b29f5e1..b542030f13 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -3,14 +3,14 @@ import os from dataclasses import dataclass, field, replace from itertools import chain -from typing import Dict, List, Tuple, Union +from pathlib import Path +from typing import Any import numpy as np import torch import torch.distributed as dist import torchaudio from coqpit import Coqpit -from librosa.filters import mel as librosa_mel_fn from monotonic_alignment_search import maximum_path from torch import nn from torch.nn import functional as F @@ -21,7 +21,7 @@ from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig -from TTS.tts.datasets.dataset import TTSDataset, _parse_sample +from TTS.tts.datasets.dataset import TTSDataset, _parse_sample, get_attribute_balancer_weights from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder @@ -35,6 +35,7 @@ from TTS.tts.utils.text.characters import BaseCharacters, BaseVocabulary, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment +from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec from TTS.utils.samplers import BucketBatchSampler from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -45,10 +46,6 @@ # IO / Feature extraction ############################## -# pylint: disable=global-statement -hann_window = {} -mel_basis = {} - @torch.no_grad() def weights_reset(m: nn.Module): @@ -78,143 +75,6 @@ def load_audio(file_path): return x, sr -def _amp_to_db(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def _db_to_amp(x, C=1): - return torch.exp(x) / C - - -def amp_to_db(magnitudes): - output = _amp_to_db(magnitudes) - return output - - -def db_to_amp(magnitudes): - output = _db_to_amp(magnitudes) - return output - - -def wav_to_spec(y, n_fft, hop_length, win_length, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - 
mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): - """ - Args Shapes: - - spec : :math:`[B,C,T]` - - Return Shapes: - - mel : :math:`[B,C,T]` - """ - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - mel = torch.matmul(mel_basis[fmax_dtype_device], spec) - mel = amp_to_db(mel) - return mel - - -def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = amp_to_db(spec) - return spec - - ############################# # CONFIGS ############################# @@ -236,30 +96,6 @@ class VitsAudioConfig(Coqpit): ############################## -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): - """Create inverse frequency weights for balancing the dataset. 
- Use `multi_dict` to scale relative weights.""" - attr_names_samples = np.array([item[attr_name] for item in items]) - unique_attr_names = np.unique(attr_names_samples).tolist() - attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] - attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) - weight_attr = 1.0 / attr_count - dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) - dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) - if multi_dict is not None: - # check if all keys are in the multi_dict - for k in multi_dict: - assert k in unique_attr_names, f"{k} not in {unique_attr_names}" - # scale weights - multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) - dataset_samples_weight *= multiplier_samples - return ( - torch.from_numpy(dataset_samples_weight).float(), - unique_attr_names, - np.unique(dataset_samples_weight).tolist(), - ) - - class VitsDataset(TTSDataset): def __init__(self, model_args, *args, **kwargs): super().__init__(*args, **kwargs) @@ -565,12 +401,12 @@ class VitsArgs(Coqpit): dilation_rate_flow: int = 1 num_layers_flow: int = 4 resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + periods_multi_period_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) use_sdp: bool = True noise_scale: float = 1.0 inference_noise_scale: float = 0.667 @@ -583,7 +419,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None - d_vector_file: List[str] = None + d_vector_file: list[str] = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_dim: int = 0 @@ -730,10 +566,6 @@ def __init__( use_spectral_norm=self.args.use_spectral_norm_disriminator, ) - @property - def device(self): - return next(self.parameters()).device - def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -848,7 +680,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 raise RuntimeError(" [!] 
The weights of Text Encoder was not reinit check it !") logger.info("Text Encoder was reinit.") - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} @@ -878,7 +710,7 @@ def _freeze_layers(self): param.requires_grad = False @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -900,7 +732,7 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -973,7 +805,7 @@ def forward( # pylint: disable=dangerous-default-value y_lengths: torch.tensor, waveform: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - ) -> Dict: + ) -> dict: """Forward pass of the model. Args: @@ -1092,7 +924,7 @@ def _set_x_lengths(x, aux_input): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -1179,7 +1011,7 @@ def inference( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_voice_conversion( self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None ): @@ -1220,8 +1052,8 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): assert self.num_speakers > 0, "num_speakers have to be larger than 0." # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: - g_src = self.emb_g(torch.from_numpy((np.array(speaker_cond_src))).unsqueeze(0)).unsqueeze(-1) - g_tgt = self.emb_g(torch.from_numpy((np.array(speaker_cond_tgt))).unsqueeze(0)).unsqueeze(-1) + g_src = self.emb_g(torch.from_numpy(np.array(speaker_cond_src)).unsqueeze(0)).unsqueeze(-1) + g_tgt = self.emb_g(torch.from_numpy(np.array(speaker_cond_tgt)).unsqueeze(0)).unsqueeze(-1) elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) @@ -1234,7 +1066,7 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) return o_hat, y_mask, (z, z_p, z_hat) - def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Perform a single training step. Run the model forward pass and compute losses. Args: @@ -1354,9 +1186,7 @@ def _log(self, ap, batch, outputs, name_prefix="train"): # pylint: disable=unus ) return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use """Create visualizations and waveform examples. 
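The VITS changes above drop the module-level DSP helpers (amp_to_db, wav_to_spec, spec_to_mel, wav_to_mel) and the get_attribute_balancer_weights function in favour of the shared implementations now imported from TTS.utils.audio.torch_transforms and TTS.tts.datasets.dataset. A usage sketch, assuming the relocated audio transforms keep the signatures shown in the removed code; the STFT/mel parameters are illustrative placeholders:

import torch

from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec

wav = torch.randn(1, 1, 22050)  # [B, 1, T] dummy waveform, values illustrative only
spec = wav_to_spec(wav, n_fft=1024, hop_length=256, win_length=1024, center=False)      # [B, C, T']
mel = spec_to_mel(spec, n_fft=1024, num_mels=80, sample_rate=22050, fmin=0, fmax=None)  # [B, 80, T']
# Single-step equivalent of the two calls above:
mel_direct = wav_to_mel(
    wav, n_fft=1024, num_mels=80, sample_rate=22050, hop_length=256,
    win_length=1024, fmin=0, fmax=None, center=False,
)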
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1374,7 +1204,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) @@ -1431,8 +1261,8 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -1458,17 +1288,21 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_audios[f"{idx}-audio"] = wav + test_figures[f"{idx}-alignment"] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None language_ids = None @@ -1532,9 +1366,9 @@ def format_batch_on_device(self, batch): ) if self.args.encoder_sample_rate: - assert batch["spec"].shape[2] == int( - batch["mel"].shape[2] / self.interpolate_factor - ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + assert batch["spec"].shape[2] == int(batch["mel"].shape[2] / self.interpolate_factor), ( + f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + ) else: assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" @@ -1591,12 +1425,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=F def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -1655,7 +1489,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returns 2 optimizers in a list. First one is for the discriminator @@ -1673,7 +1507,7 @@ def get_optimizer(self) -> List: ) return [optimizer0, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1681,7 +1515,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1704,9 +1538,7 @@ def get_criterion(self): return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] - def load_checkpoint( - self, config, checkpoint_path, eval=False, strict=True, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) # compat band-aid for the pre-trained models to not use the encoder baked into the model @@ -1733,9 +1565,7 @@ def load_checkpoint( self.eval() assert not self.training - def load_fairseq_checkpoint( - self, config, checkpoint_dir, eval=False, strict=True - ): # pylint: disable=unused-argument, redefined-builtin + def load_fairseq_checkpoint(self, config, checkpoint_dir, eval=False, strict=True): # pylint: disable=unused-argument, redefined-builtin """Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms Performs some changes for compatibility. @@ -1750,13 +1580,16 @@ def load_fairseq_checkpoint( self.disc = None # set paths - config_file = os.path.join(checkpoint_dir, "config.json") - checkpoint_file = os.path.join(checkpoint_dir, "G_100000.pth") - vocab_file = os.path.join(checkpoint_dir, "vocab.txt") + checkpoint_dir = Path(checkpoint_dir) + config_file = checkpoint_dir / "config.json" + checkpoint_file = checkpoint_dir / "model.pth" + if not checkpoint_file.is_file(): + checkpoint_file = checkpoint_dir / "G_100000.pth" + vocab_file = checkpoint_dir / "vocab.txt" # set config params - with open(config_file, "r", encoding="utf-8") as file: + with open(config_file, encoding="utf-8") as f: # Load the JSON data as a dictionary - config_org = json.load(file) + config_org = json.load(f) self.config.audio.sample_rate = config_org["data"]["sampling_rate"] # self.config.add_blank = config['add_blank'] # set tokenizer @@ -1778,7 +1611,7 @@ def load_fairseq_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -1791,15 +1624,15 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() if not config.model_args.encoder_sample_rate: - assert ( - upsample_rate == config.audio.hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + assert upsample_rate == config.audio.hop_length, ( + f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + ) else: encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor - assert ( - upsample_rate == effective_hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + assert upsample_rate == effective_hop_length, ( + f" [!] 
Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + ) ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) @@ -1990,7 +1823,7 @@ def to_config(self) -> "CharactersConfig": class FairseqVocab(BaseVocabulary): - def __init__(self, vocab: str): + def __init__(self, vocab: str | os.PathLike[Any]): super(FairseqVocab).__init__() self.vocab = vocab @@ -2000,7 +1833,7 @@ def vocab(self): return self._vocab @vocab.setter - def vocab(self, vocab_file): + def vocab(self, vocab_file: str | os.PathLike[Any]): with open(vocab_file, encoding="utf-8") as f: self._vocab = [x.replace("\n", "") for x in f.readlines()] self.blank = self._vocab[0] diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 22d2720efa..2df07a0435 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -93,25 +93,6 @@ def load_audio(audiopath, sampling_rate): return audio -def pad_or_truncate(t, length): - """ - Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it. - - Args: - t (torch.Tensor): The input tensor to be padded or truncated. - length (int): The desired length of the tensor. - - Returns: - torch.Tensor: The padded or truncated tensor. - """ - tp = t[..., :length] - if t.shape[-1] == length: - tp = t - elif t.shape[-1] < length: - tp = F.pad(t, (0, length - t.shape[-1])) - return tp - - @dataclass class XttsAudioConfig(Coqpit): """ @@ -120,10 +101,12 @@ class XttsAudioConfig(Coqpit): Args: sample_rate (int): The sample rate in which the GPT operates. output_sample_rate (int): The sample rate of the output audio waveform. + dvae_sample_rate (int): The sample rate of the DVAE """ sample_rate: int = 22050 output_sample_rate: int = 24000 + dvae_sample_rate: int = 22050 @dataclass @@ -194,7 +177,7 @@ class XttsArgs(Coqpit): class Xtts(BaseTTS): - """ⓍTTS model implementation. + """XTTS model implementation. ❗ Currently it only supports inference. @@ -255,10 +238,6 @@ def init_models(self): cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, ) - @property - def device(self): - return next(self.parameters()).device - @torch.inference_mode() def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6): """Compute the conditioning latents for the GPT model from the given audio. @@ -400,9 +379,9 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa as latents used at inference. """ - assert ( - "zh-cn" if language == "zh" else language in self.config.languages - ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + assert "zh-cn" if language == "zh" else language in self.config.languages, ( + f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + ) # Use generally found best tuning knobs for generation. settings = { "temperature": config.temperature, @@ -476,7 +455,7 @@ def full_inference( gpt_cond_chunk_len: (int) Chunk length used for cloning. It must be <= `gpt_cond_len`. If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to 6 seconds. - hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive + hf_generate_kwargs: (`**kwargs`) The huggingface Transformers generate API is used for the autoregressive transformer. Extra keyword args fed to this function get forwarded directly to that API. 
Documentation here: https://huggingface.co/docs/transformers/internal/generation_utils @@ -540,9 +519,9 @@ def inference( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) with torch.no_grad(): gpt_codes = self.gpt.generate( @@ -648,9 +627,9 @@ def inference_stream( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) fake_inputs = self.gpt.compute_embeddings( gpt_cond_latent.to(self.device), @@ -738,14 +717,14 @@ def get_compatible_checkpoint_state_dict(self, model_path): def load_checkpoint( self, - config, - checkpoint_dir=None, - checkpoint_path=None, - vocab_path=None, - eval=True, - strict=True, - use_deepspeed=False, - speaker_file_path=None, + config: "XttsConfig", + checkpoint_dir: str | None = None, + checkpoint_path: str | None = None, + vocab_path: str | None = None, + eval: bool = True, + strict: bool = True, + use_deepspeed: bool = False, + speaker_file_path: str | None = None, ): """ Loads a checkpoint from disk and initializes the model's state and tokenizer. @@ -761,7 +740,9 @@ def load_checkpoint( Returns: None """ - + if checkpoint_dir is not None and Path(checkpoint_dir).is_file(): + msg = f"You passed a file to `checkpoint_dir=`. Use `checkpoint_path={checkpoint_dir}` instead." 
+ raise ValueError(msg) model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") if vocab_path is None: if checkpoint_dir is not None and (Path(checkpoint_dir) / "vocab.json").is_file(): diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 22e46b683a..d0269060c8 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -11,7 +11,7 @@ def _pad_data(x, length): def prepare_data(inputs): - max_len = max((len(x) for x in inputs)) + max_len = max(len(x) for x in inputs) return np.stack([_pad_data(x, max_len) for x in inputs]) @@ -23,7 +23,7 @@ def _pad_tensor(x, length): def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + max_len = max(x.shape[1] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) @@ -46,7 +46,7 @@ def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: def prepare_stop_target(inputs, out_steps): """Pad row vectors with 1.""" - max_len = max((x.shape[0] for x in inputs)) + max_len = max(x.shape[0] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index d1722501f7..a3648eff4b 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -33,7 +33,7 @@ def inverse_transform(self, X): # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): +def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) -> torch.Tensor: """Create a sequence mask for filtering padding in a sequence tensor. Args: @@ -44,7 +44,7 @@ def sequence_mask(sequence_length, max_len=None): - mask: :math:`[B, T_max]` """ if max_len is None: - max_len = sequence_length.max() + max_len = int(sequence_length.max()) seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) # B x T_max return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) @@ -105,9 +105,9 @@ def rand_segments( _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size else: - assert all( - len_diff > 0 - ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all(len_diff > 0), ( + f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + ) segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long() ret = segment(x, segment_indices, segment_size, pad_short=pad_short) return ret, segment_indices @@ -143,22 +143,75 @@ def convert_pad_shape(pad_shape: list[list]) -> list: return [item for sublist in l for item in sublist] -def generate_path(duration, mask): - """ +def generate_path(duration: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Generate alignment path based on the given segment durations. 
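
A small sketch of the retyped `sequence_mask` helper above (behavior unchanged; `max_len` is now coerced to `int`). Illustration only, not part of the patch:

```python
import torch

from TTS.tts.utils.helpers import sequence_mask

lengths = torch.tensor([2, 4, 3])
mask = sequence_mask(lengths)  # max_len inferred as int(lengths.max()) == 4
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True],
#         [ True,  True,  True, False]])
```
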
+ Shapes: - duration: :math:`[B, T_en]` - mask: :math:'[B, T_en, T_de]` - path: :math:`[B, T_en, T_de]` """ b, t_x, t_y = mask.shape - cum_duration = torch.cumsum(duration, 1) + cum_duration = torch.cumsum(duration, dim=1) cum_duration_flat = cum_duration.view(b * t_x) path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) path = path.view(b, t_x, t_y) path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path * mask - return path + return path * mask + + +def generate_attention( + duration: torch.Tensor, x_mask: torch.Tensor, y_mask: torch.Tensor | None = None +) -> torch.Tensor: + """Generate an attention map from the linear scale durations. + + Args: + duration (Tensor): Linear scale durations. + x_mask (Tensor): Mask for the input (character) sequence. + y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations + if None. Defaults to None. + + Shapes + - duration: :math:`(B, T_{en})` + - x_mask: :math:`(B, T_{en})` + - y_mask: :math:`(B, T_{de})` + """ + # compute decode mask from the durations + if y_mask is None: + y_lengths = duration.sum(dim=1).long() + y_lengths[y_lengths < 1] = 1 + y_mask = sequence_mask(y_lengths).unsqueeze(1).to(duration.dtype) + attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) + return generate_path(duration, attn_mask.squeeze(1)).to(duration.dtype) + + +def expand_encoder_outputs( + x: torch.Tensor, duration: torch.Tensor, x_mask: torch.Tensor, y_lengths: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Generate attention alignment map from durations and expand encoder outputs. + + Shapes: + - x: Encoder output :math:`(B, D_{en}, T_{en})` + - duration: :math:`(B, T_{en})` + - x_mask: :math:`(B, T_{en})` + - y_lengths: :math:`(B)` + + Examples:: + + encoder output: [a,b,c,d] + durations: [1, 3, 2, 1] + + expanded: [a, b, b, b, c, c, d] + attention map: [[0, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0]] + """ + y_mask = sequence_mask(y_lengths).unsqueeze(1).to(x.dtype) + attn = generate_attention(duration, x_mask, y_mask) + x_expanded = torch.einsum("kmn, kjm -> kjn", [attn.float(), x]) + return x_expanded, attn, y_mask def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0): diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index f134daf58e..5ce7759dd8 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional +from typing import Any, Optional import fsspec import numpy as np @@ -27,8 +27,8 @@ class LanguageManager(BaseIDManager): def __init__( self, - language_ids_file_path: str = "", - config: Coqpit = None, + language_ids_file_path: str | os.PathLike[Any] = "", + config: Coqpit | None = None, ): super().__init__(id_file_path=language_ids_file_path) @@ -40,11 +40,11 @@ def num_languages(self) -> int: return len(list(self.name_to_id.keys())) @property - def language_names(self) -> List: + def language_names(self) -> list: return list(self.name_to_id.keys()) @staticmethod - def parse_language_ids_from_config(c: Coqpit) -> Dict: + def parse_language_ids_from_config(c: Coqpit) -> dict: """Set language id from config. 
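
A standalone sketch of the expansion illustrated in the `expand_encoder_outputs` docstring above. It re-derives the alignment with plain cumulative sums instead of calling the new helpers, so it runs without a model (illustration only, not part of the patch):

```python
import torch

duration = torch.tensor([[1, 3, 2, 1]])           # (B=1, T_en=4)
t_de = int(duration.sum())                        # 7 decoder frames
cum = torch.cumsum(duration, dim=1)               # [[1, 4, 6, 7]]
frames = torch.arange(t_de)
# attn[b, i, j] == 1 when decoder frame j is assigned to input token i
attn = ((frames >= (cum - duration).unsqueeze(-1)) & (frames < cum.unsqueeze(-1))).float()
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])        # encoder output (B, D_en=1, T_en=4)
expanded = torch.einsum("kmn,kjm->kjn", attn, x)  # -> [[[1., 2., 2., 2., 3., 3., 4.]]]
```
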
Args: @@ -70,13 +70,13 @@ def set_language_ids_from_config(self, c: Coqpit) -> None: self.name_to_id = self.parse_language_ids_from_config(c) @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Any: + def parse_ids_from_data(items: list, parse_key: str) -> Any: raise NotImplementedError - def set_ids_from_data(self, items: List, parse_key: str) -> Any: + def set_ids_from_data(self, items: list, parse_key: str) -> Any: raise NotImplementedError - def save_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save language IDs to a json file. Args: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 6a2f7df67b..49e93454f2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -1,6 +1,7 @@ import json +import os import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any import fsspec import numpy as np @@ -12,7 +13,8 @@ from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -def load_file(path: str): +def load_file(path: str | os.PathLike[Any]): + path = str(path) if path.endswith(".json"): with fsspec.open(path, "r") as f: return json.load(f) @@ -23,7 +25,8 @@ def load_file(path: str): raise ValueError("Unsupported file type") -def save_file(obj: Any, path: str): +def save_file(obj: Any, path: str | os.PathLike[Any]): + path = str(path) if path.endswith(".json"): with fsspec.open(path, "w") as f: json.dump(obj, f, indent=4) @@ -39,23 +42,23 @@ class BaseIDManager: It defines common `ID` manager specific functions. """ - def __init__(self, id_file_path: str = ""): + def __init__(self, id_file_path: str | os.PathLike[Any] = ""): self.name_to_id = {} if id_file_path: self.load_ids_from_file(id_file_path) @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: + def _load_json(json_file_path: str | os.PathLike[Any]) -> dict: + with fsspec.open(str(json_file_path), "r") as f: return json.load(f) @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: + def _save_json(json_file_path: str | os.PathLike[Any], data: dict) -> None: + with fsspec.open(str(json_file_path), "w") as f: json.dump(data, f, indent=4) - def set_ids_from_data(self, items: List, parse_key: str) -> None: + def set_ids_from_data(self, items: list, parse_key: str) -> None: """Set IDs from data samples. Args: @@ -63,7 +66,7 @@ def set_ids_from_data(self, items: List, parse_key: str) -> None: """ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key) - def load_ids_from_file(self, file_path: str) -> None: + def load_ids_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Set IDs from a file. Args: @@ -71,7 +74,7 @@ def load_ids_from_file(self, file_path: str) -> None: """ self.name_to_id = load_file(file_path) - def save_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save IDs to a json file. Args: @@ -93,7 +96,7 @@ def get_random_id(self) -> Any: return None @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + def parse_ids_from_data(items: list, parse_key: str) -> tuple[dict]: """Parse IDs from data samples retured by `load_tts_samples()`. 
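
A short sketch of the path-like support added to the manager file helpers above (the file name is a placeholder; illustration only, not part of the patch):

```python
from pathlib import Path

from TTS.tts.utils.managers import load_file, save_file

ids_path = Path("speaker_ids.json")    # placeholder file name
save_file({"speaker_a": 0}, ids_path)  # ".json" suffix -> JSON writer via fsspec
assert load_file(ids_path) == {"speaker_a": 0}
```
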
Args: @@ -130,10 +133,10 @@ class EmbeddingManager(BaseIDManager): def __init__( self, - embedding_file_path: Union[str, List[str]] = "", - id_file_path: str = "", - encoder_model_path: str = "", - encoder_config_path: str = "", + embedding_file_path: str | os.PathLike[Any] | list[str | os.PathLike[Any]] = "", + id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__(id_file_path=id_file_path) @@ -176,7 +179,7 @@ def embedding_names(self): """Get embedding names.""" return list(self.embeddings_by_names.keys()) - def save_embeddings_to_file(self, file_path: str) -> None: + def save_embeddings_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save embeddings to a json file. Args: @@ -185,7 +188,7 @@ def save_embeddings_to_file(self, file_path: str) -> None: save_file(self.embeddings, file_path) @staticmethod - def read_embeddings_from_file(file_path: str): + def read_embeddings_from_file(file_path: str | os.PathLike[Any]): """Load embeddings from a json file. Args: @@ -204,7 +207,7 @@ def read_embeddings_from_file(file_path: str): embeddings_by_names[x["name"]].append(x["embedding"]) return name_to_id, clip_ids, embeddings, embeddings_by_names - def load_embeddings_from_file(self, file_path: str) -> None: + def load_embeddings_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Load embeddings from a json file. Args: @@ -214,7 +217,7 @@ def load_embeddings_from_file(self, file_path: str) -> None: file_path ) - def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None: + def load_embeddings_from_list_of_files(self, file_paths: list[str | os.PathLike[Any]]) -> None: """Load embeddings from a list of json files and don't allow duplicate keys. Args: @@ -239,7 +242,7 @@ def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None: # reset name_to_id to get the right speaker ids self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)} - def get_embedding_by_clip(self, clip_idx: str) -> List: + def get_embedding_by_clip(self, clip_idx: str) -> list: """Get embedding by clip ID. Args: @@ -250,7 +253,7 @@ def get_embedding_by_clip(self, clip_idx: str) -> List: """ return self.embeddings[clip_idx]["embedding"] - def get_embeddings_by_name(self, idx: str) -> List[List]: + def get_embeddings_by_name(self, idx: str) -> list[list]: """Get all embeddings of a speaker. Args: @@ -261,7 +264,7 @@ def get_embeddings_by_name(self, idx: str) -> List[List]: """ return self.embeddings_by_names[idx] - def get_embeddings_by_names(self) -> Dict: + def get_embeddings_by_names(self) -> dict: """Get all embeddings by names. Returns: @@ -310,10 +313,12 @@ def get_random_embedding(self) -> Any: return None - def get_clips(self) -> List: + def get_clips(self) -> list: return sorted(self.embeddings.keys()) - def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None: + def init_encoder( + self, model_path: str | os.PathLike[Any], config_path: str | os.PathLike[Any], use_cuda=False + ) -> None: """Initialize a speaker encoder model. 
Args: @@ -325,11 +330,12 @@ def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> Non self.encoder_config = load_config(config_path) self.encoder = setup_encoder_model(self.encoder_config) self.encoder_criterion = self.encoder.load_checkpoint( - self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True + self.encoder_config, str(model_path), eval=True, use_cuda=use_cuda, cache=True ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) - def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list: + @torch.inference_mode() + def compute_embedding_from_clip(self, wav_file: str | os.PathLike[Any] | list[str | os.PathLike[Any]]) -> list: """Compute a embedding from a given audio file. Args: @@ -366,7 +372,7 @@ def _compute(wav_file: str): embedding = _compute(wav_file) return embedding[0].tolist() - def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + def compute_embeddings(self, feats: torch.Tensor | np.ndarray) -> list: """Compute embedding from features. Args: diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5229af81c5..6fab27de5a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, List, Union +from typing import Any import fsspec import numpy as np @@ -56,11 +56,11 @@ class SpeakerManager(EmbeddingManager): def __init__( self, - data_items: List[List[Any]] = None, + data_items: list[list[Any]] | None = None, d_vectors_file_path: str = "", - speaker_id_file_path: str = "", - encoder_model_path: str = "", - encoder_config_path: str = "", + speaker_id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__( @@ -82,11 +82,11 @@ def num_speakers(self): def speaker_names(self): return list(self.name_to_id.keys()) - def get_speakers(self) -> List: + def get_speakers(self) -> list: return self.name_to_id @staticmethod - def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": + def init_from_config(config: "Coqpit", samples: list[list] | list[dict] = None) -> "SpeakerManager": """Initialize a speaker manager from config Args: @@ -150,7 +150,7 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: +def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: """Initiate a `SpeakerManager` instance by the provided config. Args: @@ -185,9 +185,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.name_to_id speaker_manager.load_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." + assert all(speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data), ( + " [!] You cannot introduce new speakers to a pre-trained model." + ) elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. 
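
A schematic sketch of the path-like encoder arguments above. The paths are placeholders, and it assumes the usual behavior that `EmbeddingManager` initializes the encoder when both encoder paths are given (illustration only, not part of the patch):

```python
from pathlib import Path

from TTS.tts.utils.managers import EmbeddingManager

manager = EmbeddingManager(
    encoder_model_path=Path("speaker_encoder/model.pth"),     # placeholder
    encoder_config_path=Path("speaker_encoder/config.json"),  # placeholder
)
# Runs under torch.inference_mode() after the change above:
embedding = manager.compute_embedding_from_clip(Path("clips/sample.wav"))  # placeholder clip
```
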
speaker_manager.load_embeddings_from_file(c.d_vector_file) diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index eddf05db3f..660370a832 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -1,6 +1,5 @@ # Adopted from https://github.com/photosynthesis-team/piq -from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -24,11 +23,11 @@ def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: def _validate_input( - tensors: List[torch.Tensor], - dim_range: Tuple[int, int] = (0, -1), - data_range: Tuple[float, float] = (0.0, -1.0), + tensors: list[torch.Tensor], + dim_range: tuple[int, int] = (0, -1), + data_range: tuple[float, float] = (0.0, -1.0), # size_dim_range: Tuple[float, float] = (0., -1.), - size_range: Optional[Tuple[int, int]] = None, + size_range: tuple[int, int] | None = None, ) -> None: r"""Check that input(-s) satisfies the requirements Args: @@ -50,16 +49,16 @@ def _validate_input( if size_range is None: assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}" else: - assert ( - t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]] - ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + assert t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]], ( + f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + ) if dim_range[0] == dim_range[1]: assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}" elif dim_range[0] < dim_range[1]: - assert ( - dim_range[0] <= t.dim() <= dim_range[1] - ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + assert dim_range[0] <= t.dim() <= dim_range[1], ( + f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + ) if data_range[0] < data_range[1]: assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}" @@ -89,13 +88,13 @@ def ssim( y: torch.Tensor, kernel_size: int = 11, kernel_sigma: float = 1.5, - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, reduction: str = "mean", full: bool = False, downsample: bool = True, k1: float = 0.01, k2: float = 0.03, -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: r"""Interface of Structural Similarity (SSIM) index. Inputs supposed to be in range ``[0, data_range]``. To match performance with skimage and tensorflow set ``'downsample' = True``. @@ -218,7 +217,7 @@ def __init__( k2: float = 0.03, downsample: bool = True, reduction: str = "mean", - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, ) -> None: super().__init__() @@ -270,7 +269,7 @@ def _ssim_per_channel( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. Args: @@ -286,8 +285,7 @@ def _ssim_per_channel( """ if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. 
Kernel size: {kernel.size()}" ) c1 = k1**2 @@ -321,7 +319,7 @@ def _ssim_per_channel_complex( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. Args: @@ -338,8 +336,7 @@ def _ssim_per_channel_complex( n_channels = x.size(1) if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 797151c254..c09c3f5aa2 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,17 +1,12 @@ -from typing import Dict - import numpy as np import torch from torch import nn -def numpy_to_torch(np_array, dtype, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def numpy_to_torch(np_array: np.ndarray, dtype: torch.dtype, device: str | torch.device = "cpu") -> torch.Tensor | None: if np_array is None: return None - tensor = torch.as_tensor(np_array, dtype=dtype, device=device) - return tensor + return torch.as_tensor(np_array, dtype=dtype, device=device) def compute_style_mel(style_wav, ap, cuda=False, device="cpu"): @@ -32,7 +27,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, -) -> Dict: +) -> dict: """Run a torch model for inference. It does not support batch inference. Args: @@ -76,18 +71,14 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(aux_id, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def id_to_torch(aux_id, device: str | torch.device = "cpu") -> torch.Tensor | None: if aux_id is not None: aux_id = np.asarray(aux_id) aux_id = torch.from_numpy(aux_id).to(device) return aux_id -def embedding_to_torch(d_vector, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def embedding_to_torch(d_vector, device: str | torch.device = "cpu") -> torch.Tensor | None: if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py index cddcb00fd5..1537240380 100644 --- a/TTS/tts/utils/text/bangla/phonemizer.py +++ b/TTS/tts/utils/text/bangla/phonemizer.py @@ -45,7 +45,7 @@ def tag_text(text: str): # create start and end text = "start" + text + "end" # tag text - parts = re.split("[\u0600-\u06FF]+", text) + parts = re.split("[\u0600-\u06ff]+", text) # remove non chars parts = [p for p in parts if p.strip()] # unique parts diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 4bf9bf6bd5..f8beaef036 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,6 +1,5 @@ import logging from dataclasses import replace -from typing import Dict from TTS.tts.configs.shared_configs import CharactersConfig @@ -47,7 +46,7 @@ class BaseVocabulary: vocab (Dict): A dictionary of characters and their corresponding indices. 
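
A brief sketch of the reworked synthesis helpers, which now take a `device` argument instead of a `cuda` flag (illustration only, not part of the patch):

```python
import numpy as np
import torch

from TTS.tts.utils.synthesis import embedding_to_torch, numpy_to_torch

device = "cuda" if torch.cuda.is_available() else "cpu"
mel = numpy_to_torch(np.zeros((80, 100), dtype=np.float32), torch.float32, device=device)
d_vector = embedding_to_torch(np.random.rand(512).astype(np.float32), device=device)
```
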
""" - def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): + def __init__(self, vocab: dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): self.vocab = vocab self.pad = pad self.blank = blank @@ -290,9 +289,9 @@ def _create_vocab(self): self.vocab = _vocab + list(self._punctuations) if self.is_unique: duplicates = {x for x in self.vocab if self.vocab.count(x) > 1} - assert ( - len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) - ), f" [!] There are duplicate characters in the character set. {duplicates}" + assert len(self.vocab) == len(self._char_to_id) == len(self._id_to_char), ( + f" [!] There are duplicate characters in the character set. {duplicates}" + ) def char_to_id(self, char: str) -> int: try: diff --git a/TTS/tts/utils/text/chinese_mandarin/numbers.py b/TTS/tts/utils/text/chinese_mandarin/numbers.py index 4787ea6100..3e6a043918 100644 --- a/TTS/tts/utils/text/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/text/chinese_mandarin/numbers.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Licensed under WTFPL or the Unlicense or CC0. # This uses Python 3, but it's easy to port to Python 2 by changing diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index e9d62e9d06..4dccdd5778 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -1,5 +1,3 @@ -from typing import List - try: import jieba import pypinyin @@ -9,7 +7,7 @@ from .pinyinToPhonemes import PINYIN_DICT -def _chinese_character_to_pinyin(text: str) -> List[str]: +def _chinese_character_to_pinyin(text: str) -> list[str]: pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) pinyins_flat_list = [item for sublist in pinyins for item in sublist] return pinyins_flat_list @@ -25,9 +23,9 @@ def _chinese_pinyin_to_phoneme(pinyin: str) -> str: def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: tokenized_text = jieba.cut(text, HMM=False) tokenized_text = " ".join(tokenized_text) - pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + pinyined_text: list[str] = _chinese_character_to_pinyin(tokenized_text) - results: List[str] = [] + results: list[str] = [] for token in pinyined_text: if token[-1] in "12345": # TODO transform to is_pinyin() diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index f496b9f0dd..795ab246d2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,7 +1,6 @@ """Set of default text cleaners""" import re -from typing import Optional from unicodedata import normalize from anyascii import anyascii @@ -47,7 +46,7 @@ def remove_aux_symbols(text: str) -> str: return text -def replace_symbols(text: str, lang: Optional[str] = "en") -> str: +def replace_symbols(text: str, lang: str | None = "en") -> str: """Replace symbols based on the language tag. 
Args: diff --git a/TTS/tts/utils/text/cmudict.py b/TTS/tts/utils/text/cmudict.py index f206fb043b..9c0df06196 100644 --- a/TTS/tts/utils/text/cmudict.py +++ b/TTS/tts/utils/text/cmudict.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import re VALID_SYMBOLS = [ @@ -121,7 +119,7 @@ def get_arpabet(word, cmudict, punctuation_symbols): word = word[:-1] arpabet = cmudict.lookup(word) if arpabet is not None: - return first_symbol + "{%s}" % arpabet[0] + last_symbol + return first_symbol + "{%s}" % arpabet[0] + last_symbol # noqa: UP031 return first_symbol + word + last_symbol diff --git a/TTS/tts/utils/text/english/abbreviations.py b/TTS/tts/utils/text/english/abbreviations.py index cd93c13c8e..20042b255b 100644 --- a/TTS/tts/utils/text/english/abbreviations.py +++ b/TTS/tts/utils/text/english/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in english: abbreviations_en = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index e8377ede87..be2a4b3084 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -1,7 +1,6 @@ -""" from https://github.com/keithito/tacotron """ +"""from https://github.com/keithito/tacotron""" import re -from typing import Dict import inflect @@ -21,7 +20,7 @@ def _expand_decimal_point(m): return m.group(1).replace(".", " point ") -def __expand_currency(value: str, inflection: Dict[float, str]) -> str: +def __expand_currency(value: str, inflection: dict[float, str]) -> str: parts = value.replace(",", "").split(".") if len(parts) > 2: return f"{value} {inflection[2]}" # Unexpected format @@ -85,7 +84,11 @@ def _expand_number(m): if num % 100 == 0: return _inflect.number_to_words(num // 100) + " hundred" return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - return _inflect.number_to_words(num, andword="") + try: + text = _inflect.number_to_words(num, andword="") + except inflect.NumOutOfRangeError: + text = _inflect.number_to_words(num, group=1).replace(", ", " ") + return text def normalize_numbers(text): diff --git a/TTS/tts/utils/text/french/abbreviations.py b/TTS/tts/utils/text/french/abbreviations.py index f580dfed7b..e317bbbf3a 100644 --- a/TTS/tts/utils/text/french/abbreviations.py +++ b/TTS/tts/utils/text/french/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("M", "monsieur"), ("Mlle", "mademoiselle"), @@ -38,7 +38,7 @@ ("boul", "boulevard"), ] ] + [ - (re.compile("\\b%s" % x[0]), x[1]) + (re.compile(f"\\b{x[0]}"), x[1]) for x in [ ("Mlle", "mademoiselle"), ("Mlles", "mesdemoiselles"), diff --git a/TTS/tts/utils/text/korean/ko_dictionary.py b/TTS/tts/utils/text/korean/ko_dictionary.py index 9b739339c6..706f9f5daf 100644 --- a/TTS/tts/utils/text/korean/ko_dictionary.py +++ b/TTS/tts/utils/text/korean/ko_dictionary.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Add the word you want to the dictionary. 
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py index 423aeed377..1b1e0ca0fb 100644 --- a/TTS/tts/utils/text/korean/korean.py +++ b/TTS/tts/utils/text/korean/korean.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py import re diff --git a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py index 3c4a35bbfa..3be7354636 100644 --- a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_bn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"bn": "Bangla"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 5e701df458..6cc6ec0b37 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,6 +1,5 @@ import abc import logging -from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation @@ -37,7 +36,7 @@ class BasePhonemizer(abc.ABC): def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): # ensure the backend is installed on the system if not self.is_available(): - raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover + raise RuntimeError(f"{self.name()} not installed on your system") # pragma: nocover # ensure the backend support the requested language self._language = self._init_language(language) @@ -53,7 +52,7 @@ def _init_language(self, language): """ if not self.is_supported_language(language): - raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") + raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend') return language @property @@ -93,7 +92,7 @@ def is_supported_language(self, language): def _phonemize(self, text, separator): """The main phonemization method""" - def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: + def _phonemize_preprocess(self, text) -> tuple[list[str], list]: """Preprocess the text before phonemization 1. 
remove spaces diff --git a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py index e5fcab6e09..fa4a515d1a 100644 --- a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -34,7 +32,7 @@ def _phonemize(self, text, separator): return self.phonemize_be(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"be": "Belarusian"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index a15df716e7..dbcb8994a7 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -5,7 +5,6 @@ import subprocess import tempfile from pathlib import Path -from typing import Optional from packaging.version import Version @@ -104,7 +103,7 @@ class ESpeak(BasePhonemizer): def __init__( self, language: str, - backend: Optional[str] = None, + backend: str | None = None, punctuations: str = Punctuation.default_puncs(), keep_puncs: bool = True, ): @@ -184,7 +183,7 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False else: args.append("--ipa=1") if tie: - args.append("--tie=%s" % tie) + args.append(f"--tie={tie}") tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8") tmp.write(text) diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index f3e9c9abd4..836fccf5b8 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from typing import List import gruut from gruut_ipa import IPA @@ -114,7 +113,7 @@ def is_supported_language(self, language): return gruut.is_language_supported(language) @staticmethod - def supported_languages() -> List: + def supported_languages() -> list: """Get a dictionary of supported languages. 
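
For context, a usage sketch of the ESpeak wrapper touched above; the constructor API is unchanged and `espeak` or `espeak-ng` must be installed on the system (illustration only, not part of the patch):

```python
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak

phonemizer = ESpeak(language="en-us")  # raises at construction if espeak is missing
print(phonemizer.phonemize("hello world", separator="|"))
```
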
Returns: diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 878e5e5296..b3b3ba4db7 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -51,7 +49,7 @@ def phonemize(self, text: str, separator="|", language=None) -> str: return self._phonemize(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ja-jp": "Japanese (Japan)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py index 0bdba2137b..93930d064e 100644 --- a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -44,7 +42,7 @@ def phonemize(self, text: str, separator: str = "", character: str = "hangeul", return self._phonemize(text, separator, character) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ko-kr": "hangeul(korean)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index 1a9e98b091..87fb940f6b 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name @@ -19,7 +18,7 @@ class MultiPhonemizer: lang_to_phonemizer = {} - def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value + def __init__(self, lang_to_phonemizer_name: dict = {}) -> None: # pylint: disable=dangerous-default-value for k, v in lang_to_phonemizer_name.items(): if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] @@ -29,7 +28,7 @@ def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disab self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @staticmethod - def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: + def init_phonemizers(lang_to_phonemizer_name: dict) -> dict: lang_to_phonemizer = {} for k, v in lang_to_phonemizer_name.items(): lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) @@ -44,7 +43,7 @@ def phonemize(self, text, separator="|", language=""): raise ValueError("Language must be set for multi-phonemizer to phonemize.") return self.lang_to_phonemizer[language].phonemize(text, separator) - def supported_languages(self) -> List: + def supported_languages(self) -> list: return list(self.lang_to_phonemizer.keys()) def print_logs(self, level: int = 0): diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index 41480c4173..9e70b03a0c 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_zh_cn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"zh-cn": "Chinese (China)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f653cdf13f..07a8753884 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -1,5 +1,6 @@ import logging -from typing import Callable, Dict, List, Union +from collections.abc import Callable +from typing import Union from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes @@ -43,7 +44,7 @@ def __init__( use_phonemes=False, text_cleaner: Callable = None, characters: "BaseCharacters" = None, - phonemizer: Union["Phonemizer", Dict] = None, + phonemizer: Union["Phonemizer", dict] = None, add_blank: bool = False, use_eos_bos=False, ): @@ -65,7 +66,7 @@ def characters(self, new_characters): self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: """Encodes a string of text as a sequence of IDs.""" token_ids = [] for char in text: @@ -80,14 +81,14 @@ def encode(self, text: str) -> List[int]: logger.warning("Character %s not found in the vocabulary. Discarding it.", repr(char)) return token_ids - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: """Decodes a sequence of IDs to a string of text.""" text = "" for token_id in token_ids: text += self.characters.id_to_char(token_id) return text - def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument + def text_to_ids(self, text: str, language: str = None) -> list[int]: # pylint: disable=unused-argument """Converts a string of text to a sequence of token IDs. Args: @@ -121,15 +122,15 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: text = self.pad_with_bos_eos(text) return text - def ids_to_text(self, id_sequence: List[int]) -> str: + def ids_to_text(self, id_sequence: list[int]) -> str: """Converts a sequence of token IDs to a string of text.""" return self.decode(id_sequence) - def pad_with_bos_eos(self, char_sequence: List[str]): + def pad_with_bos_eos(self, char_sequence: list[str]): """Pads a sequence with the special BOS and EOS characters.""" return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id] - def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False): + def intersperse_blank_char(self, char_sequence: list[str], use_blank_char: bool = False): """Intersperses the blank character between characters in a sequence. Use the ```blank``` character if defined else use the ```pad``` character. 
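
A round-trip sketch of the retyped `encode`/`decode` pair above, using the default grapheme character set (illustration only, not part of the patch):

```python
from TTS.tts.utils.text.characters import Graphemes
from TTS.tts.utils.text.tokenizer import TTSTokenizer

tokenizer = TTSTokenizer(characters=Graphemes())
token_ids = tokenizer.encode("hello")          # list[int]
assert tokenizer.decode(token_ids) == "hello"
```
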
@@ -163,7 +164,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): """ # init cleaners text_cleaner = None - if isinstance(config.text_cleaner, (str, list)): + if isinstance(config.text_cleaner, str | list): text_cleaner = getattr(cleaners, config.text_cleaner) # init characters diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 203091ea88..7fd4259178 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,6 +1,7 @@ import logging +import os from io import BytesIO -from typing import Optional +from typing import Any import librosa import numpy as np @@ -20,7 +21,7 @@ def build_mel_basis( fft_size: int, num_mels: int, mel_fmin: int, - mel_fmax: Optional[int] = None, + mel_fmax: int | None = None, **kwargs, ) -> np.ndarray: """Build melspectrogram basis. @@ -59,7 +60,7 @@ def _exp(x, base): return np.exp(x) -def amp_to_db(*, x: np.ndarray, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray: +def amp_to_db(*, x: np.ndarray, gain: float = 1, base: float = 10, **kwargs) -> np.ndarray: """Convert amplitude values to decibels. Args: @@ -176,8 +177,8 @@ def stft( *, y: np.ndarray, fft_size: int, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, pad_mode: str = "reflect", window: str = "hann", center: bool = True, @@ -204,8 +205,8 @@ def stft( def istft( *, y: np.ndarray, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, window: str = "hann", center: bool = True, **kwargs, @@ -247,8 +248,8 @@ def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool def compute_f0( *, x: np.ndarray, - pitch_fmax: Optional[float] = None, - pitch_fmin: Optional[float] = None, + pitch_fmax: float | None = None, + pitch_fmin: float | None = None, hop_length: int, win_length: int, sample_rate: int, @@ -406,7 +407,9 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n return rms_norm(wav=x, db_level=db_level) -def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool = False, **kwargs) -> np.ndarray: +def load_wav( + *, filename: str | os.PathLike[Any], sample_rate: int | None = None, resample: bool = False, **kwargs +) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -434,7 +437,7 @@ def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool def save_wav( *, wav: np.ndarray, - path: str, + path: str | os.PathLike[Any], sample_rate: int, pipe_out=None, do_rms_norm: bool = False, diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 1d8fed8e39..55b8575aa4 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,5 +1,6 @@ import logging -from typing import Optional +import os +from typing import Any import librosa import numpy as np @@ -221,9 +222,9 @@ def __init__( self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert ( - self.win_length <= self.fft_size - ), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert self.win_length <= self.fft_size, ( + f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + ) members = vars(self) logger.info("Setting up Audio Processor...") for key, value in members.items(): @@ -282,7 +283,9 @@ def normalize(self, S: np.ndarray) -> np.ndarray: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: S_norm = np.clip( - S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_norm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) return S_norm S_norm = self.max_norm * S_norm @@ -317,7 +320,9 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip( - S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_denorm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db @@ -350,9 +355,9 @@ def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np. if key in skip_parameters: continue if key not in ["sample_rate", "trim_db"]: - assert ( - stats_config[key] == self.__dict__[key] - ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + assert stats_config[key] == self.__dict__[key], ( + f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + ) return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init @@ -548,7 +553,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: return volume_norm(x=x) ### save and load ### - def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: + def load_wav(self, filename: str | os.PathLike[Any], sr: int | None = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -575,7 +580,7 @@ def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: Optional[int] = None, pipe_out=None) -> None: + def save_wav(self, wav: np.ndarray, path: str | os.PathLike[Any], sr: int | None = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. 
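
A schematic sketch of the path-like audio I/O above; the wav path is a placeholder (illustration only, not part of the patch):

```python
from pathlib import Path

from TTS.utils.audio.processor import AudioProcessor

ap = AudioProcessor(sample_rate=22050)
wav = ap.load_wav(Path("clips/sample.wav"))           # placeholder file
ap.save_wav(wav, Path("sample_copy.wav"), sr=22050)
```
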
Args: diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py index 632969c51a..59bb23cc4f 100644 --- a/TTS/utils/audio/torch_transforms.py +++ b/TTS/utils/audio/torch_transforms.py @@ -1,7 +1,113 @@ +import logging + import librosa import torch from torch import nn +logger = logging.getLogger(__name__) + + +hann_window = {} +mel_basis = {} + + +def amp_to_db(x: torch.Tensor, *, spec_gain: float = 1.0, clip_val: float = 1e-5) -> torch.Tensor: + """Spectral normalization / dynamic range compression.""" + return torch.log(torch.clamp(x, min=clip_val) * spec_gain) + + +def db_to_amp(x: torch.Tensor, *, spec_gain: float = 1.0) -> torch.Tensor: + """Spectral denormalization / dynamic range decompression.""" + return torch.exp(x) / spec_gain + + +def wav_to_spec(y: torch.Tensor, n_fft: int, hop_length: int, win_length: int, *, center: bool = False) -> torch.Tensor: + """ + Args Shapes: + - y : :math:`[B, 1, T]` + + Return Shapes: + - spec : :math:`[B,C,T]` + """ + y = y.squeeze(1) + + if torch.min(y) < -1.0: + logger.info("min value is %.3f", torch.min(y)) + if torch.max(y) > 1.0: + logger.info("max value is %.3f", torch.max(y)) + + global hann_window + wnsize_dtype_device = f"{win_length}_{y.dtype}_{y.device}" + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + return torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + +def spec_to_mel( + spec: torch.Tensor, n_fft: int, num_mels: int, sample_rate: int, fmin: float, fmax: float +) -> torch.Tensor: + """ + Args Shapes: + - spec : :math:`[B,C,T]` + + Return Shapes: + - mel : :math:`[B,C,T]` + """ + global mel_basis + fmax_dtype_device = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}" + if fmax_dtype_device not in mel_basis: + # TODO: switch librosa to torchaudio + mel = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + mel = torch.matmul(mel_basis[fmax_dtype_device], spec) + return amp_to_db(mel) + + +def wav_to_mel( + y: torch.Tensor, + n_fft: int, + num_mels: int, + sample_rate: int, + hop_length: int, + win_length: int, + fmin: float, + fmax: float, + *, + center: bool = False, +) -> torch.Tensor: + """ + Args Shapes: + - y : :math:`[B, 1, T]` + + Return Shapes: + - spec : :math:`[B,C,T]` + """ + spec = wav_to_spec(y, n_fft, hop_length, win_length, center=center) + return spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax) + class TorchSTFT(nn.Module): # pylint: disable=abstract-method """Some of the audio processing funtions using Torch for faster batch processing. 
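
A minimal sketch of the new functional mel helpers added above, with signatures taken from the hunk and illustrative parameter values (illustration only, not part of the patch):

```python
import torch

from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec

y = torch.rand(2, 1, 22050) * 2 - 1  # (B, 1, T) waveform batch in [-1, 1]
spec = wav_to_spec(y, n_fft=1024, hop_length=256, win_length=1024, center=False)
mel = spec_to_mel(spec, n_fft=1024, num_mels=80, sample_rate=22050, fmin=0, fmax=8000)
# Equivalent one-step call:
mel_direct = wav_to_mel(
    y, n_fft=1024, num_mels=80, sample_rate=22050,
    hop_length=256, win_length=1024, fmin=0, fmax=8000, center=False,
)
```
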
@@ -157,11 +263,3 @@ def _build_mel_basis(self): norm=self.mel_norm, ) self.mel_basis = torch.from_numpy(mel_basis).float() - - @staticmethod - def _amp_to_db(x, spec_gain=1.0): - return torch.log(torch.clamp(x, min=1e-5) * spec_gain) - - @staticmethod - def _db_to_amp(x, spec_gain=1.0): - return torch.exp(x) / spec_gain diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py deleted file mode 100644 index 511d215c65..0000000000 --- a/TTS/utils/callbacks.py +++ /dev/null @@ -1,105 +0,0 @@ -class TrainerCallback: - @staticmethod - def on_init_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_start"): - trainer.model.module.on_init_start(trainer) - else: - if hasattr(trainer.model, "on_init_start"): - trainer.model.on_init_start(trainer) - - if hasattr(trainer.criterion, "on_init_start"): - trainer.criterion.on_init_start(trainer) - - if hasattr(trainer.optimizer, "on_init_start"): - trainer.optimizer.on_init_start(trainer) - - @staticmethod - def on_init_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_end"): - trainer.model.module.on_init_end(trainer) - else: - if hasattr(trainer.model, "on_init_end"): - trainer.model.on_init_end(trainer) - - if hasattr(trainer.criterion, "on_init_end"): - trainer.criterion.on_init_end(trainer) - - if hasattr(trainer.optimizer, "on_init_end"): - trainer.optimizer.on_init_end(trainer) - - @staticmethod - def on_epoch_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_start"): - trainer.model.module.on_epoch_start(trainer) - else: - if hasattr(trainer.model, "on_epoch_start"): - trainer.model.on_epoch_start(trainer) - - if hasattr(trainer.criterion, "on_epoch_start"): - trainer.criterion.on_epoch_start(trainer) - - if hasattr(trainer.optimizer, "on_epoch_start"): - trainer.optimizer.on_epoch_start(trainer) - - @staticmethod - def on_epoch_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_end"): - trainer.model.module.on_epoch_end(trainer) - else: - if hasattr(trainer.model, "on_epoch_end"): - trainer.model.on_epoch_end(trainer) - - if hasattr(trainer.criterion, "on_epoch_end"): - trainer.criterion.on_epoch_end(trainer) - - if hasattr(trainer.optimizer, "on_epoch_end"): - trainer.optimizer.on_epoch_end(trainer) - - @staticmethod - def on_train_step_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_start"): - trainer.model.module.on_train_step_start(trainer) - else: - if hasattr(trainer.model, "on_train_step_start"): - trainer.model.on_train_step_start(trainer) - - if hasattr(trainer.criterion, "on_train_step_start"): - trainer.criterion.on_train_step_start(trainer) - - if hasattr(trainer.optimizer, "on_train_step_start"): - trainer.optimizer.on_train_step_start(trainer) - - @staticmethod - def on_train_step_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_end"): - trainer.model.module.on_train_step_end(trainer) - else: - if hasattr(trainer.model, "on_train_step_end"): - trainer.model.on_train_step_end(trainer) - - if hasattr(trainer.criterion, "on_train_step_end"): - trainer.criterion.on_train_step_end(trainer) - - if hasattr(trainer.optimizer, "on_train_step_end"): - trainer.optimizer.on_train_step_end(trainer) - - @staticmethod - def on_keyboard_interrupt(trainer) -> None: - if hasattr(trainer.model, 
"module"): - if hasattr(trainer.model.module, "on_keyboard_interrupt"): - trainer.model.module.on_keyboard_interrupt(trainer) - else: - if hasattr(trainer.model, "on_keyboard_interrupt"): - trainer.model.on_keyboard_interrupt(trainer) - - if hasattr(trainer.criterion, "on_keyboard_interrupt"): - trainer.criterion.on_keyboard_interrupt(trainer) - - if hasattr(trainer.optimizer, "on_keyboard_interrupt"): - trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py index 7206ffd508..01f303f98d 100644 --- a/TTS/utils/capacitron_optimizer.py +++ b/TTS/utils/capacitron_optimizer.py @@ -1,4 +1,4 @@ -from typing import Generator +from collections.abc import Generator from trainer.trainer_utils import get_optimizer diff --git a/TTS/utils/download.py b/TTS/utils/download.py index e94b1d68c8..75ef9164f6 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -7,8 +7,9 @@ import urllib import urllib.request import zipfile +from collections.abc import Iterable from os.path import expanduser -from typing import Any, Iterable, List, Optional +from typing import Any from torch.utils.model_zoo import tqdm @@ -16,7 +17,7 @@ def stream_url( - url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True + url: str, start_byte: int | None = None, block_size: int = 32 * 1024, progress_bar: bool = True ) -> Iterable: """Stream url by chunk @@ -36,7 +37,7 @@ def stream_url( req = urllib.request.Request(url) if start_byte: - req.headers["Range"] = "bytes={}-".format(start_byte) + req.headers["Range"] = f"bytes={start_byte}-" with ( urllib.request.urlopen(req) as upointer, @@ -61,8 +62,8 @@ def stream_url( def download_url( url: str, download_folder: str, - filename: Optional[str] = None, - hash_value: Optional[str] = None, + filename: str | None = None, + hash_value: str | None = None, hash_type: str = "sha256", progress_bar: bool = True, resume: bool = False, @@ -88,10 +89,10 @@ def download_url( filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): mode = "ab" - local_size: Optional[int] = os.path.getsize(filepath) + local_size: int | None = os.path.getsize(filepath) elif not resume and os.path.exists(filepath): - raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"{filepath} already exists. Delete the file manually and retry.") else: mode = "wb" local_size = None @@ -100,7 +101,7 @@ def download_url( with open(filepath, "rb") as file_obj: if validate_file(file_obj, hash_value, hash_type): return - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -108,7 +109,7 @@ def download_url( with open(filepath, "rb") as file_obj: if hash_value and not validate_file(file_obj, hash_value, hash_type): - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. 
Delete the file manually and retry.") def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: @@ -140,7 +141,7 @@ def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> return hash_func.hexdigest() == hash_value -def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: +def extract_archive(from_path: str, to_path: str | None = None, overwrite: bool = False) -> list[str]: """Extract archive. Args: from_path (str): the path of the archive. diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py index 8705873982..c06c2649ad 100644 --- a/TTS/utils/downloaders.py +++ b/TTS/utils/downloaders.py @@ -1,6 +1,5 @@ import logging import os -from typing import Optional from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive @@ -21,7 +20,7 @@ def download_ljspeech(path: str): extract_archive(archive) -def download_vctk(path: str, use_kaggle: Optional[bool] = False): +def download_vctk(path: str, use_kaggle: bool | None = False): """Download and extract VCTK dataset. Args: @@ -49,7 +48,7 @@ def download_tweb(path: str): download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path) -def download_libri_tts(path: str, subset: Optional[str] = "all"): +def download_libri_tts(path: str, subset: str | None = "all"): """Download and extract libri tts dataset. Args: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 3ee285232f..e1df6f6ed4 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,22 +1,37 @@ -# -*- coding: utf-8 -*- import datetime import importlib import logging +import os import re +from collections.abc import Callable from pathlib import Path -from typing import Dict, Optional +from typing import Any, TextIO, TypeVar import torch from packaging.version import Version +from typing_extensions import TypeIs logger = logging.getLogger(__name__) +_T = TypeVar("_T") + + +def exists(val: _T | None) -> TypeIs[_T]: + return val is not None + + +def default(val: _T | None, d: _T | Callable[[], _T]) -> _T: + if exists(val): + return val + return d() if callable(d) else d + def to_camel(text): text = text.capitalize() text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) text = text.replace("Tts", "TTS") text = text.replace("vc", "VC") + text = text.replace("Knn", "KNN") return text @@ -54,26 +69,7 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - logger.warning("Layer missing in the model finition %s", k) - # 1. filter out unnecessary keys - pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} - # 2. filter out different size layers - pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} - # 3. skip reinit layers - if c.has("reinit_layers") and c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} - # 4. 
overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) - return model_dict - - -def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: dict, kwargs: dict) -> dict: """Format kwargs to hande auxilary inputs to models. Args: @@ -84,9 +80,9 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: Dict: arguments with formatted auxilary inputs. """ kwargs = kwargs.copy() - for name in def_args: + for name, arg in def_args.items(): if name not in kwargs or kwargs[name] is None: - kwargs[name] = def_args[name] + kwargs[name] = arg return kwargs @@ -112,26 +108,35 @@ def setup_logger( logger_name: str, level: int = logging.INFO, *, - formatter: Optional[logging.Formatter] = None, - screen: bool = False, - tofile: bool = False, - log_dir: str = "logs", + formatter: logging.Formatter | None = None, + stream: TextIO | None = None, + log_dir: str | os.PathLike[Any] | None = None, log_name: str = "log", ) -> None: + """Set up a logger. + + Args: + logger_name: Name of the logger to set up + level: Logging level + formatter: Formatter for the logger + stream: Add a StreamHandler for the given stream, e.g. sys.stderr or sys.stdout + log_dir: Folder to write the log file (no file created if None) + log_name: Prefix of the log file name + """ lg = logging.getLogger(logger_name) if formatter is None: formatter = logging.Formatter( "%(asctime)s.%(msecs)03d - %(levelname)-8s - %(name)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S" ) lg.setLevel(level) - if tofile: + if log_dir is not None: Path(log_dir).mkdir(exist_ok=True, parents=True) log_file = Path(log_dir) / f"{log_name}_{get_timestamp()}.log" fh = logging.FileHandler(log_file, mode="w") fh.setFormatter(formatter) lg.addHandler(fh) - if screen: - sh = logging.StreamHandler() + if stream is not None: + sh = logging.StreamHandler(stream) sh.setFormatter(formatter) lg.addHandler(sh) @@ -139,3 +144,8 @@ def setup_logger( def is_pytorch_at_least_2_4() -> bool: """Check if the installed Pytorch version is 2.4 or higher.""" return Version(torch.__version__) >= Version("2.4") + + +def optional_to_str(x: Any | None) -> str: + """Convert input to string, using empty string if input is None.""" + return "" if x is None else str(x) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index bd445b3a2f..20d6ab226b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -6,17 +6,36 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, Tuple +from typing import Any, TypedDict import fsspec import requests from tqdm import tqdm from trainer.io import get_user_data_dir +from typing_extensions import Required from TTS.config import load_config, read_json_with_comments +from TTS.vc.configs.knnvc_config import KNNVCConfig logger = logging.getLogger(__name__) + +class ModelItem(TypedDict, total=False): + model_name: Required[str] + model_type: Required[str] + description: str + license: str + author: str + contact: str + commit: str | None + model_hash: str + tos_required: bool + default_vocoder: str | None + model_url: str | list[str] + github_rls_url: str | list[str] + hf_url: list[str] + + LICENSE_URLS = { "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", @@ -30,7 +49,7 @@ } -class ModelManager(object): +class ModelManager: tqdm_progress = None """Manage TTS models defined in .models.json. 
It provides an interface to list and download @@ -40,19 +59,24 @@ class ModelManager(object): home path. Args: - models_file (str): path to .model.json file. Defaults to None. - output_prefix (str): prefix to `tts` to download models. Defaults to None + models_file (str or Path): path to .model.json file. Defaults to None. + output_prefix (str or Path): prefix to `tts` to download models. Defaults to None progress_bar (bool): print a progress bar when donwloading a file. Defaults to False. """ - def __init__(self, models_file=None, output_prefix=None, progress_bar=False): + def __init__( + self, + models_file: str | os.PathLike[Any] | None = None, + output_prefix: str | os.PathLike[Any] | None = None, + progress_bar: bool = False, + ) -> None: super().__init__() self.progress_bar = progress_bar if output_prefix is None: self.output_prefix = get_user_data_dir("tts") else: - self.output_prefix = os.path.join(output_prefix, "tts") - self.models_dict = None + self.output_prefix = Path(output_prefix) / "tts" + self.models_dict = {} if models_file is not None: self.read_models_file(models_file) else: @@ -60,7 +84,7 @@ def __init__(self, models_file=None, output_prefix=None, progress_bar=False): path = Path(__file__).parent / "../.models.json" self.read_models_file(path) - def read_models_file(self, file_path): + def read_models_file(self, file_path: str | os.PathLike[Any]) -> None: """Read .models.json as a dict Args: @@ -68,7 +92,7 @@ def read_models_file(self, file_path): """ self.models_dict = read_json_with_comments(file_path) - def _list_models(self, model_type, model_count=0): + def _list_models(self, model_type: str, model_count: int = 0) -> list[str]: logger.info("") logger.info("Name format: type/language/dataset/model") model_list = [] @@ -83,21 +107,23 @@ def _list_models(self, model_type, model_count=0): model_count += 1 return model_list - def _list_for_model_type(self, model_type): + def _list_for_model_type(self, model_type: str) -> list[str]: models_name_list = [] model_count = 1 models_name_list.extend(self._list_models(model_type, model_count)) return models_name_list - def list_models(self): + def list_models(self) -> list[str]: models_name_list = [] model_count = 1 for model_type in self.models_dict: model_list = self._list_models(model_type, model_count) models_name_list.extend(model_list) + logger.info("") + logger.info("Path to downloaded models: %s", self.output_prefix) return models_name_list - def log_model_details(self, model_type, lang, dataset, model): + def log_model_details(self, model_type: str, lang: str, dataset: str, model: str) -> None: logger.info("Model type: %s", model_type) logger.info("Language supported: %s", lang) logger.info("Dataset used: %s", dataset) @@ -112,7 +138,7 @@ def log_model_details(self, model_type, lang, dataset, model): self.models_dict[model_type][lang][dataset][model]["default_vocoder"], ) - def model_info_by_idx(self, model_query): + def model_info_by_idx(self, model_query: str) -> None: """Print the description of the model from .models.json file using model_query_idx Args: @@ -144,7 +170,7 @@ def model_info_by_idx(self, model_query): model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/") self.log_model_details(model_type, lang, dataset, model) - def model_info_by_full_name(self, model_query_name): + def model_info_by_full_name(self, model_query_name: str) -> None: """Print the description of the model from .models.json file using model_full_name Args: @@ -165,35 +191,35 @@ def 
model_info_by_full_name(self, model_query_name): return self.log_model_details(model_type, lang, dataset, model) - def list_tts_models(self): + def list_tts_models(self) -> list[str]: """Print all `TTS` models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("tts_models") - def list_vocoder_models(self): + def list_vocoder_models(self) -> list[str]: """Print all the `vocoder` models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("vocoder_models") - def list_vc_models(self): + def list_vc_models(self) -> list[str]: """Print all the voice conversion models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("voice_conversion_models") - def list_langs(self): + def list_langs(self) -> None: """Print all the available languages""" logger.info("Name format: type/language") for model_type in self.models_dict: for lang in self.models_dict[model_type]: logger.info(" %s/%s", model_type, lang) - def list_datasets(self): + def list_datasets(self) -> None: """Print all the datasets""" logger.info("Name format: type/language/dataset") for model_type in self.models_dict: @@ -202,7 +228,7 @@ def list_datasets(self): logger.info(" %s/%s/%s", model_type, lang, dataset) @staticmethod - def print_model_license(model_item: Dict): + def print_model_license(model_item: ModelItem) -> None: """Print the license of a model Args: @@ -217,49 +243,49 @@ def print_model_license(model_item: Dict): else: logger.info("Model's license - No license information available") - def _download_github_model(self, model_item: Dict, output_path: str): + def _download_github_model(self, model_item: ModelItem, output_path: Path) -> None: if isinstance(model_item["github_rls_url"], list): self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar) else: self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar) - def _download_hf_model(self, model_item: Dict, output_path: str): + def _download_hf_model(self, model_item: ModelItem, output_path: Path) -> None: if isinstance(model_item["hf_url"], list): self._download_model_files(model_item["hf_url"], output_path, self.progress_bar) else: self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar) - def download_fairseq_model(self, model_name, output_path): + def download_fairseq_model(self, model_name: str, output_path: Path) -> None: URI_PREFIX = "https://dl.fbaipublicfiles.com/mms/tts/" _, lang, _, _ = model_name.split("/") model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz") self._download_tar_file(model_download_uri, output_path, self.progress_bar) @staticmethod - def set_model_url(model_item: Dict): - model_item["model_url"] = None + def set_model_url(model_item: ModelItem) -> ModelItem: + model_item["model_url"] = "" if "github_rls_url" in model_item: model_item["model_url"] = model_item["github_rls_url"] elif "hf_url" in model_item: model_item["model_url"] = model_item["hf_url"] - elif "fairseq" in model_item["model_name"]: + elif "fairseq" in model_item.get("model_name", ""): model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/" - elif "xtts" in model_item["model_name"]: + elif "xtts" in model_item.get("model_name", ""): model_item["model_url"] = "https://huggingface.co/coqui/" return model_item - def _set_model_item(self, model_name): + def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, str | 
None]: # fetch model info from the dict if "fairseq" in model_name: model_type, lang, dataset, model = model_name.split("/") - model_item = { + model_item: ModelItem = { + "model_name": model_name, "model_type": "tts_models", "license": "CC BY-NC 4.0", "default_vocoder": None, "author": "fairseq", "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.", } - model_item["model_name"] = model_name elif "xtts" in model_name and len(model_name.split("/")) != 4: # loading xtts models with only model name (e.g. xtts_v2.0.2) # check model name has the version number with regex @@ -273,6 +299,8 @@ def _set_model_item(self, model_name): dataset = "multi-dataset" model = model_name model_item = { + "model_name": model_name, + "model_type": model_type, "default_vocoder": None, "license": "CPML", "contact": "info@coqui.ai", @@ -297,9 +325,9 @@ def _set_model_item(self, model_name): return model_item, model_full_name, model, md5hash @staticmethod - def ask_tos(model_full_path): + def ask_tos(model_full_path: Path) -> bool: """Ask the user to agree to the terms of service""" - tos_path = os.path.join(model_full_path, "tos_agreed.txt") + tos_path = model_full_path / "tos_agreed.txt" print(" > You must confirm the following:") print(' | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"') print(' | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]') @@ -311,7 +339,7 @@ def ask_tos(model_full_path): return False @staticmethod - def tos_agreed(model_item, model_full_path): + def tos_agreed(model_item: ModelItem, model_full_path: Path) -> bool: """Check if the user has agreed to the terms of service""" if "tos_required" in model_item and model_item["tos_required"]: tos_path = os.path.join(model_full_path, "tos_agreed.txt") @@ -320,12 +348,12 @@ def tos_agreed(model_item, model_full_path): return False return True - def create_dir_and_download_model(self, model_name, model_item, output_path): - os.makedirs(output_path, exist_ok=True) + def create_dir_and_download_model(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: + output_path.mkdir(exist_ok=True, parents=True) # handle TOS if not self.tos_agreed(model_item, output_path): if not self.ask_tos(output_path): - os.rmdir(output_path) + output_path.rmdir() raise Exception(" [!] You must agree to the terms of service to use this model.") logger.info("Downloading model to %s", output_path) try: @@ -340,9 +368,12 @@ def create_dir_and_download_model(self, model_name, model_item, output_path): logger.exception("Failed to download the model file to %s", output_path) rmtree(output_path) raise e + checkpoints = list(Path(output_path).glob("*.pt*")) + if len(checkpoints) == 1: + checkpoints[0].rename(checkpoints[0].parent / "model.pth") self.print_model_license(model_item=model_item) - def check_if_configs_are_equal(self, model_name, model_item, output_path): + def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: with fsspec.open(self._find_files(output_path)[1], "r", encoding="utf-8") as f: config_local = json.load(f) remote_url = None @@ -358,7 +389,7 @@ def check_if_configs_are_equal(self, model_name, model_item, output_path): logger.info("%s is already downloaded however it has been changed. 
Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) - def download_model(self, model_name): + def download_model(self, model_name: str) -> tuple[Path, Path | None, ModelItem]: """Download model files given the full model name. Model name is in the format 'type/language/dataset/model' @@ -374,12 +405,12 @@ def download_model(self, model_name): """ model_item, model_full_name, model, md5sum = self._set_model_item(model_name) # set the model specific output path - output_path = os.path.join(self.output_prefix, model_full_name) - if os.path.exists(output_path): + output_path = Path(self.output_prefix) / model_full_name + if output_path.is_dir(): if md5sum is not None: - md5sum_file = os.path.join(output_path, "hash.md5") - if os.path.isfile(md5sum_file): - with open(md5sum_file, mode="r") as f: + md5sum_file = output_path / "hash.md5" + if md5sum_file.is_file(): + with md5sum_file.open() as f: if not f.read() == md5sum: logger.info("%s has been updated, clearing model cache...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) @@ -404,15 +435,20 @@ def download_model(self, model_name): output_model_path = output_path output_config_path = None if ( - model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name + model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name ): # TODO:This is stupid but don't care for now. output_model_path, output_config_path = self._find_files(output_path) + else: + output_config_path = output_model_path / "config.json" + if model == "knnvc" and not output_config_path.exists(): + knnvc_config = KNNVCConfig() + knnvc_config.save_json(output_config_path) # update paths in the config.json self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item @staticmethod - def _find_files(output_path: str) -> Tuple[str, str]: + def _find_files(output_path: Path) -> tuple[Path, Path]: """Find the model and config files in the output path Args: @@ -423,11 +459,11 @@ def _find_files(output_path: str) -> Tuple[str, str]: """ model_file = None config_file = None - for file_name in os.listdir(output_path): - if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: - model_file = os.path.join(output_path, file_name) - elif file_name == "config.json": - config_file = os.path.join(output_path, file_name) + for f in output_path.iterdir(): + if f.name in ["model_file.pth", "model_file.pth.tar", "model.pth", "checkpoint.pth"]: + model_file = f + elif f.name == "config.json": + config_file = f if model_file is None: raise ValueError(" [!] 
Model file not found in the output path") if config_file is None: @@ -435,7 +471,7 @@ def _find_files(output_path: str) -> Tuple[str, str]: return model_file, config_file @staticmethod - def _find_speaker_encoder(output_path: str) -> str: + def _find_speaker_encoder(output_path: Path) -> Path | None: """Find the speaker encoder file in the output path Args: @@ -445,24 +481,24 @@ def _find_speaker_encoder(output_path: str) -> str: str: path to the speaker encoder file """ speaker_encoder_file = None - for file_name in os.listdir(output_path): - if file_name in ["model_se.pth", "model_se.pth.tar"]: - speaker_encoder_file = os.path.join(output_path, file_name) + for f in output_path.iterdir(): + if f.name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = f return speaker_encoder_file - def _update_paths(self, output_path: str, config_path: str) -> None: + def _update_paths(self, output_path: Path, config_path: Path) -> None: """Update paths for certain files in config.json after download. Args: output_path (str): local path the model is downloaded to. config_path (str): local config.json path. """ - output_stats_path = os.path.join(output_path, "scale_stats.npy") - output_d_vector_file_path = os.path.join(output_path, "speakers.json") - output_d_vector_file_pth_path = os.path.join(output_path, "speakers.pth") - output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") - output_speaker_ids_file_pth_path = os.path.join(output_path, "speaker_ids.pth") - speaker_encoder_config_path = os.path.join(output_path, "config_se.json") + output_stats_path = output_path / "scale_stats.npy" + output_d_vector_file_path = output_path / "speakers.json" + output_d_vector_file_pth_path = output_path / "speakers.pth" + output_speaker_ids_file_path = output_path / "speaker_ids.json" + output_speaker_ids_file_pth_path = output_path / "speaker_ids.pth" + speaker_encoder_config_path = output_path / "config_se.json" speaker_encoder_model_path = self._find_speaker_encoder(output_path) # update the scale_path.npy file path in the model config.json @@ -487,10 +523,10 @@ def _update_paths(self, output_path: str, config_path: str) -> None: self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) @staticmethod - def _update_path(field_name, new_path, config_path): + def _update_path(field_name: str, new_path: Path | None, config_path: Path) -> None: """Update the path in the model config.json for the current environment after download""" - if new_path and os.path.exists(new_path): - config = load_config(config_path) + if new_path is not None and new_path.is_file(): + config = load_config(str(config_path)) field_names = field_name.split(".") if len(field_names) > 1: # field name points to a sub-level field @@ -515,7 +551,7 @@ def _update_path(field_name, new_path, config_path): config.save_json(config_path) @staticmethod - def _download_zip_file(file_url, output_folder, progress_bar): + def _download_zip_file(file_url: str, output_folder: Path, progress_bar: bool) -> None: """Download the github releases""" # download the file r = requests.get(file_url, stream=True) @@ -525,7 +561,7 @@ def _download_zip_file(file_url, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) + temp_zip_name = output_folder / file_url.split("/")[-1] with open(temp_zip_name, "wb") as 
file: for data in r.iter_content(block_size): if progress_bar: @@ -533,24 +569,24 @@ def _download_zip_file(file_url, output_folder, progress_bar): file.write(data) with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) - os.remove(temp_zip_name) # delete zip after extract + temp_zip_name.unlink() # delete zip after extract except zipfile.BadZipFile: logger.exception("Bad zip file - %s", file_url) raise zipfile.BadZipFile # pylint: disable=raise-missing-from # move the files to the outer path for file_path in z.namelist(): - src_path = os.path.join(output_folder, file_path) - if os.path.isfile(src_path): - dst_path = os.path.join(output_folder, os.path.basename(file_path)) + src_path = output_folder / file_path + if src_path.is_file(): + dst_path = output_folder / os.path.basename(file_path) if src_path != dst_path: copyfile(src_path, dst_path) # remove redundant (hidden or not) folders for file_path in z.namelist(): - if os.path.isdir(os.path.join(output_folder, file_path)): - rmtree(os.path.join(output_folder, file_path)) + if (output_folder / file_path).is_dir(): + rmtree(output_folder / file_path) @staticmethod - def _download_tar_file(file_url, output_folder, progress_bar): + def _download_tar_file(file_url: str, output_folder: Path, progress_bar: bool) -> None: """Download the github releases""" # download the file r = requests.get(file_url, stream=True) @@ -560,7 +596,7 @@ def _download_tar_file(file_url, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) + temp_tar_name = output_folder / file_url.split("/")[-1] with open(temp_tar_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: @@ -569,43 +605,35 @@ def _download_tar_file(file_url, output_folder, progress_bar): with tarfile.open(temp_tar_name) as t: t.extractall(output_folder) tar_names = t.getnames() - os.remove(temp_tar_name) # delete tar after extract + temp_tar_name.unlink() # delete tar after extract except tarfile.ReadError: logger.exception("Bad tar file - %s", file_url) raise tarfile.ReadError # pylint: disable=raise-missing-from # move the files to the outer path - for file_path in os.listdir(os.path.join(output_folder, tar_names[0])): - src_path = os.path.join(output_folder, tar_names[0], file_path) - dst_path = os.path.join(output_folder, os.path.basename(file_path)) + for file_path in (output_folder / tar_names[0]).iterdir(): + src_path = file_path + dst_path = output_folder / file_path.name if src_path != dst_path: copyfile(src_path, dst_path) # remove the extracted folder - rmtree(os.path.join(output_folder, tar_names[0])) + rmtree(output_folder / tar_names[0]) @staticmethod - def _download_model_files(file_urls, output_folder, progress_bar): + def _download_model_files(file_urls: list[str], output_folder: str | os.PathLike[Any], progress_bar: bool) -> None: """Download the github releases""" + output_folder = Path(output_folder) for file_url in file_urls: # download the file r = requests.get(file_url, stream=True) # extract the file - bease_filename = file_url.split("/")[-1] - temp_zip_name = os.path.join(output_folder, bease_filename) + base_filename = file_url.split("/")[-1] + file_path = output_folder / base_filename total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte - with open(temp_zip_name, "wb") as file: + with open(file_path, "wb") as f: if 
progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) for data in r.iter_content(block_size): if progress_bar: ModelManager.tqdm_progress.update(len(data)) - file.write(data) - - @staticmethod - def _check_dict_key(my_dict, key): - if key in my_dict.keys() and my_dict[key] is not None: - if not isinstance(key, str): - return True - if isinstance(key, str) and len(my_dict[key]) > 0: - return True - return False + f.write(data) diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index cbd14990f3..b893d115c9 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -9,16 +9,16 @@ class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) + raise ValueError(f"Invalid learning rate: {lr}") if eps < 0.0: - raise ValueError("Invalid epsilon value: {}".format(eps)) + raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + if isinstance(params, list | tuple) and len(params) > 0 and isinstance(params[0], dict): for param in params: if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): param["buffer"] = [[None, None, None] for _ in range(10)] diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py index b08a763a33..d24733977a 100644 --- a/TTS/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -1,6 +1,6 @@ import math import random -from typing import Callable, List, Union +from collections.abc import Callable from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler @@ -49,9 +49,9 @@ def __init__( label_key="class_name", ): super().__init__(dataset_items) - assert ( - batch_size % (num_classes_in_batch * num_gpus) == 0 - ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." 
+ ) label_indices = {} for idx, item in enumerate(dataset_items): @@ -176,7 +176,7 @@ def __init__( data, batch_size, drop_last, - sort_key: Union[Callable, List] = identity, + sort_key: Callable | list = identity, bucket_size_multiplier=100, ): super().__init__(sampler, batch_size, drop_last) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 90af4f48f9..cebb094a48 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,7 +1,8 @@ import logging import os import time -from typing import List +from pathlib import Path +from typing import Any import numpy as np import pysbd @@ -15,7 +16,10 @@ from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import save_wav +from TTS.utils.generic_utils import optional_to_str +from TTS.vc.configs.openvoice_config import OpenVoiceConfig from TTS.vc.models import setup_model as setup_vc_model +from TTS.vc.models.openvoice import OpenVoice from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input @@ -25,18 +29,19 @@ class Synthesizer(nn.Module): def __init__( self, - tts_checkpoint: str = "", - tts_config_path: str = "", - tts_speakers_file: str = "", - tts_languages_file: str = "", - vocoder_checkpoint: str = "", - vocoder_config: str = "", - encoder_checkpoint: str = "", - encoder_config: str = "", - vc_checkpoint: str = "", - vc_config: str = "", - model_dir: str = "", - voice_dir: str = None, + *, + tts_checkpoint: str | os.PathLike[Any] | None = None, + tts_config_path: str | os.PathLike[Any] | None = None, + tts_speakers_file: str | os.PathLike[Any] | None = None, + tts_languages_file: str | os.PathLike[Any] | None = None, + vocoder_checkpoint: str | os.PathLike[Any] | None = None, + vocoder_config: str | os.PathLike[Any] | None = None, + encoder_checkpoint: str | os.PathLike[Any] | None = None, + encoder_config: str | os.PathLike[Any] | None = None, + vc_checkpoint: str | os.PathLike[Any] | None = None, + vc_config: str | os.PathLike[Any] | None = None, + model_dir: str | os.PathLike[Any] | None = None, + voice_dir: str | os.PathLike[Any] | None = None, use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. It takes a tts and a vocoder @@ -62,16 +67,17 @@ def __init__( use_cuda (bool, optional): enable/disable cuda. Defaults to False. 
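A minimal usage sketch of the reworked constructor above, assuming locally available checkpoint files (every path below is a placeholder, not a real release artifact): arguments are now keyword-only, accept str, pathlib.Path, or None, and are normalized to strings internally via optional_to_str.

from pathlib import Path

from TTS.utils.synthesizer import Synthesizer

model_dir = Path("~/models/my_tts_model").expanduser()  # hypothetical local model folder
synth = Synthesizer(
    tts_checkpoint=model_dir / "model.pth",     # Path objects are accepted and converted to str internally
    tts_config_path=model_dir / "config.json",
    vocoder_checkpoint=None,                    # unused components can simply be left as None
    use_cuda=False,
)
wav = synth.tts("Hello world.")
synth.save_wav(wav, "output.wav")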
""" super().__init__() - self.tts_checkpoint = tts_checkpoint - self.tts_config_path = tts_config_path - self.tts_speakers_file = tts_speakers_file - self.tts_languages_file = tts_languages_file - self.vocoder_checkpoint = vocoder_checkpoint - self.vocoder_config = vocoder_config - self.encoder_checkpoint = encoder_checkpoint - self.encoder_config = encoder_config - self.vc_checkpoint = vc_checkpoint - self.vc_config = vc_config + self.tts_checkpoint = optional_to_str(tts_checkpoint) + self.tts_config_path = optional_to_str(tts_config_path) + self.tts_speakers_file = optional_to_str(tts_speakers_file) + self.tts_languages_file = optional_to_str(tts_languages_file) + self.vocoder_checkpoint = optional_to_str(vocoder_checkpoint) + self.vocoder_config = optional_to_str(vocoder_config) + self.encoder_checkpoint = optional_to_str(encoder_checkpoint) + self.encoder_config = optional_to_str(encoder_config) + self.vc_checkpoint = optional_to_str(vc_checkpoint) + self.vc_config = optional_to_str(vc_config) + model_dir = optional_to_str(model_dir) self.use_cuda = use_cuda self.tts_model = None @@ -90,24 +96,21 @@ def __init__( assert torch.cuda.is_available(), "CUDA is not availabe on this machine." if tts_checkpoint: - self._load_tts(tts_checkpoint, tts_config_path, use_cuda) - self.output_sample_rate = self.tts_config.audio["sample_rate"] + self._load_tts(self.tts_checkpoint, self.tts_config_path, use_cuda) - if vocoder_checkpoint: - self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) - self.output_sample_rate = self.vocoder_config.audio["sample_rate"] + if vc_checkpoint and model_dir == "": + self._load_vc(self.vc_checkpoint, self.vc_config, use_cuda) - if vc_checkpoint: - self._load_vc(vc_checkpoint, vc_config, use_cuda) - self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + if vocoder_checkpoint: + self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) if model_dir: if "fairseq" in model_dir: self._load_fairseq_from_dir(model_dir, use_cuda) - self.output_sample_rate = self.tts_config.audio["sample_rate"] + elif "openvoice" in model_dir: + self._load_openvoice_from_dir(Path(model_dir), use_cuda) else: self._load_tts_from_dir(model_dir, use_cuda) - self.output_sample_rate = self.tts_config.audio["output_sample_rate"] @staticmethod def _get_segmenter(lang: str): @@ -136,6 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N """ # pylint: disable=global-statement self.vc_config = load_config(vc_config_path) + self.output_sample_rate = self.vc_config.audio.get( + "output_sample_rate", self.vc_config.audio.get("sample_rate", None) + ) self.vc_model = setup_vc_model(config=self.vc_config) self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) if use_cuda: @@ -150,9 +156,24 @@ def _load_fairseq_from_dir(self, model_dir: str, use_cuda: bool) -> None: self.tts_model = Vits.init_from_config(self.tts_config) self.tts_model.load_fairseq_checkpoint(self.tts_config, checkpoint_dir=model_dir, eval=True) self.tts_config = self.tts_model.config + self.output_sample_rate = self.tts_config.audio["sample_rate"] if use_cuda: self.tts_model.cuda() + def _load_openvoice_from_dir(self, checkpoint: Path, use_cuda: bool) -> None: + """Load the OpenVoice model from a directory. + + We assume the model knows how to load itself from the directory and + there is a config.json file in the directory. 
+ """ + self.vc_config = OpenVoiceConfig() + self.vc_model = OpenVoice.init_from_config(self.vc_config) + self.vc_model.load_checkpoint(self.vc_config, checkpoint, eval=True) + self.vc_config = self.vc_model.config + self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + if use_cuda: + self.vc_model.cuda() + def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: """Load the TTS model from a directory. @@ -160,6 +181,7 @@ def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: """ config = load_config(os.path.join(model_dir, "config.json")) self.tts_config = config + self.output_sample_rate = self.tts_config.audio["output_sample_rate"] self.tts_model = setup_tts_model(config) self.tts_model.load_checkpoint(config, checkpoint_dir=model_dir, eval=True) if use_cuda: @@ -181,6 +203,7 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) - """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) + self.output_sample_rate = self.tts_config.audio["sample_rate"] if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: raise ValueError("Phonemizer is not defined in the TTS config.") @@ -218,13 +241,14 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N use_cuda (bool): enable/disable CUDA use. """ self.vocoder_config = load_config(model_config) + self.output_sample_rate = self.vocoder_config.audio["sample_rate"] self.vocoder_ap = AudioProcessor(**self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() - def split_into_sentences(self, text) -> List[str]: + def split_into_sentences(self, text) -> list[str]: """Split give text into sentences. Args: @@ -235,7 +259,7 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: + def save_wav(self, wav: list[int], path: str, pipe_out=None) -> None: """Save the waveform as a file. Args: @@ -250,9 +274,21 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: wav = np.array(wav) save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) - def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: - output_wav = self.vc_model.voice_conversion(source_wav, target_wav) - return output_wav + def voice_conversion(self, source_wav: str, target_wav: str | list[str], **kwargs) -> list[int]: + start_time = time.time() + + if not isinstance(target_wav, list): + target_wav = [target_wav] + output = self.vc_model.voice_conversion(source_wav, target_wav, **kwargs) + if self.vocoder_model is not None: + output = self.vocoder_model.inference(output) + + output = output.squeeze() + process_time = time.time() - start_time + audio_time = len(output) / self.output_sample_rate + logger.info("Processing time: %.3f", process_time) + logger.info("Real-time factor: %.3f", process_time / audio_time) + return output def tts( self, @@ -266,7 +302,7 @@ def tts( reference_speaker_name=None, split_sentences: bool = True, **kwargs, - ) -> List[int]: + ) -> list[int]: """🐸 TTS magic. Run all the models and generate speech. 
Args: diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index 207181b303..37f8048b7f 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List, Optional from coqpit import Coqpit @@ -47,7 +46,7 @@ class FreeVCAudioConfig(Coqpit): win_length: int = field(default=1280) n_mel_channels: int = field(default=80) mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) + mel_fmax: float | None = field(default=None) @dataclass @@ -122,11 +121,11 @@ class FreeVCArgs(Coqpit): kernel_size: int = field(default=3) p_dropout: float = field(default=0.1) resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [10, 8, 2, 2]) upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) n_layers_q: int = field(default=3) use_spectral_norm: bool = field(default=False) gin_channels: int = field(default=256) @@ -229,7 +228,7 @@ class FreeVCConfig(BaseVCConfig): If true, language embedding is used. Defaults to `False`. Note: - Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. + Check :class:`TTS.tts.configs.shared_configs.BaseVCConfig` for the inherited parameters. Example: @@ -269,7 +268,7 @@ class FreeVCConfig(BaseVCConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py new file mode 100644 index 0000000000..7728ea0a9b --- /dev/null +++ b/TTS/vc/configs/knnvc_config.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class KNNVCAudioConfig(BaseAudioConfig): + """Audio configuration. + + Args: + sample_rate (int): + The sampling rate of the input waveform. + """ + + sample_rate: int = field(default=16000) + + +@dataclass +class KNNVCArgs(Coqpit): + """Model arguments. + + Args: + ssl_dim (int): + The dimension of the self-supervised learning embedding. + """ + + ssl_dim: int = field(default=1024) + + +@dataclass +class KNNVCConfig(BaseVCConfig): + """Parameters. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (KNNVCArgs): + Model architecture arguments. Defaults to `KNNVCArgs()`. + + audio (KNNVCAudioConfig): + Audio processing configuration. Defaults to `KNNVCAudioConfig()`. + + wavlm_layer (int): + WavLM layer to use for feature extraction. 
+ + topk (int): + k in the kNN -- the number of nearest neighbors to average over + """ + + model: str = "knnvc" + model_args: KNNVCArgs = field(default_factory=KNNVCArgs) + audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig) + + wavlm_layer: int = 6 + topk: int = 4 diff --git a/TTS/vc/configs/openvoice_config.py b/TTS/vc/configs/openvoice_config.py new file mode 100644 index 0000000000..167a61ddb3 --- /dev/null +++ b/TTS/vc/configs/openvoice_config.py @@ -0,0 +1,200 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class OpenVoiceAudioConfig(Coqpit): + """Audio configuration + + Args: + input_sample_rate (int): + The sampling rate of the input waveform. + + output_sample_rate (int): + The sampling rate of the output waveform. + + fft_size (int): + The length of the filter. + + hop_length (int): + The hop length. + + win_length (int): + The window length. + """ + + input_sample_rate: int = field(default=22050) + output_sample_rate: int = field(default=22050) + fft_size: int = field(default=1024) + hop_length: int = field(default=256) + win_length: int = field(default=1024) + + +@dataclass +class OpenVoiceArgs(Coqpit): + """OpenVoice model arguments. + + zero_g (bool): + Whether to zero the gradients. + + inter_channels (int): + The number of channels in the intermediate layers. + + hidden_channels (int): + The number of channels in the hidden layers. + + filter_channels (int): + The number of channels in the filter layers. + + n_heads (int): + The number of attention heads. + + n_layers (int): + The number of layers. + + kernel_size (int): + The size of the kernel. + + p_dropout (float): + The dropout probability. + + resblock (str): + The type of residual block. + + resblock_kernel_sizes (List[int]): + The kernel sizes for the residual blocks. + + resblock_dilation_sizes (List[List[int]]): + The dilation sizes for the residual blocks. + + upsample_rates (List[int]): + The upsample rates. + + upsample_initial_channel (int): + The number of channels in the initial upsample layer. + + upsample_kernel_sizes (List[int]): + The kernel sizes for the upsample layers. + + n_layers_q (int): + The number of layers in the quantization network. + + use_spectral_norm (bool): + Whether to use spectral normalization. + + gin_channels (int): + The number of channels in the global conditioning vector. + + tau (float): + Tau parameter for the posterior encoder + """ + + zero_g: bool = field(default=True) + inter_channels: int = field(default=192) + hidden_channels: int = field(default=192) + filter_channels: int = field(default=768) + n_heads: int = field(default=2) + n_layers: int = field(default=6) + kernel_size: int = field(default=3) + p_dropout: float = field(default=0.1) + resblock: str = field(default="1") + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) + upsample_initial_channel: int = field(default=512) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + n_layers_q: int = field(default=3) + use_spectral_norm: bool = field(default=False) + gin_channels: int = field(default=256) + tau: float = field(default=0.3) + + +@dataclass +class OpenVoiceConfig(BaseVCConfig): + """Defines parameters for OpenVoice VC model. 
+ + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (OpenVoiceArgs): + Model architecture arguments. Defaults to `OpenVoiceArgs()`. + + audio (OpenVoiceAudioConfig): + Audio processing configuration. Defaults to `OpenVoiceAudioConfig()`. + + return_wav (bool): + If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. + + compute_linear_spec (bool): + If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. + + weighted_sampler_attrs (dict): + Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities + by overweighting `root_path` by 2.0. Defaults to `{}`. + + weighted_sampler_multipliers (dict): + Weight each unique value of a key returned by the formatter for weighted sampling. + For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. + It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. + + r (int): + Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. + + add_blank (bool): + If true, a blank token is added in between every character. Defaults to `True`. + + Note: + Check :class:`TTS.tts.configs.shared_configs.BaseVCConfig` for the inherited parameters. + + Example: + + >>> from TTS.vc.configs.openvoice_config import OpenVoiceConfig + >>> config = OpenVoiceConfig() + """ + + model: str = "openvoice" + # model specific params + model_args: OpenVoiceArgs = field(default_factory=OpenVoiceArgs) + audio: OpenVoiceAudioConfig = field(default_factory=OpenVoiceAudioConfig) + + # optimizer + # TODO with training support + + # loss params + # TODO with training support + + # data loader params + return_wav: bool = True + compute_linear_spec: bool = True + + # sampler params + use_weighted_sampler: bool = False # TODO: move it to the base config + weighted_sampler_attrs: dict = field(default_factory=lambda: {}) + weighted_sampler_multipliers: dict = field(default_factory=lambda: {}) + + # overrides + r: int = 1 # DO NOT CHANGE + add_blank: bool = True + + # multi-speaker settings + # use speaker embedding layer + num_speakers: int = 0 + speakers_file: str | None = None + speaker_embedding_channels: int = 256 + + # use d-vectors + use_d_vector_file: bool = False + d_vector_file: list[str] | None = None + d_vector_dim: int | None = None + + def __post_init__(self) -> None: + for key, val in self.model_args.items(): + if hasattr(self, key): + self[key] = val diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index b2fe63d29d..b84a97e487 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,12 +1,11 @@ from dataclasses import dataclass, field -from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @dataclass class BaseVCConfig(BaseTrainingConfig): - """Shared parameters among all the tts models. + """Shared parameters among all the VC models. 
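A small sketch of the two new VC configs introduced above, treated as plain Coqpit dataclasses: defaults can be overridden at construction and serialized with save_json, which is also how ModelManager materializes a default KNNVCConfig when a downloaded kNN-VC model ships without a config.json (the output path below is a placeholder).

from TTS.vc.configs.knnvc_config import KNNVCConfig
from TTS.vc.configs.openvoice_config import OpenVoiceConfig

knn_cfg = KNNVCConfig(topk=8)                # average over 8 nearest WavLM frames instead of the default 4
knn_cfg.save_json("/tmp/knnvc_config.json")  # Coqpit configs serialize to JSON

ov_cfg = OpenVoiceConfig()
assert ov_cfg.audio.output_sample_rate == 22050  # matches the OpenVoiceAudioConfig defaults above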
Args: @@ -132,7 +131,7 @@ class BaseVCConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -140,7 +139,7 @@ class BaseVCConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/vc/modules/__init__.py b/TTS/vc/layers/__init__.py similarity index 100% rename from TTS/vc/modules/__init__.py rename to TTS/vc/layers/__init__.py diff --git a/TTS/vc/modules/freevc/__init__.py b/TTS/vc/layers/freevc/__init__.py similarity index 100% rename from TTS/vc/modules/freevc/__init__.py rename to TTS/vc/layers/freevc/__init__.py diff --git a/TTS/vc/modules/freevc/commons.py b/TTS/vc/layers/freevc/commons.py similarity index 81% rename from TTS/vc/modules/freevc/commons.py rename to TTS/vc/layers/freevc/commons.py index feea7f34dc..49889e4816 100644 --- a/TTS/vc/modules/freevc/commons.py +++ b/TTS/vc/layers/freevc/commons.py @@ -3,7 +3,7 @@ import torch from torch.nn import functional as F -from TTS.tts.utils.helpers import convert_pad_shape, sequence_mask +from TTS.tts.utils.helpers import convert_pad_shape def init_weights(m: torch.nn.Module, mean: float = 0.0, std: float = 0.01) -> None: @@ -96,37 +96,11 @@ def subsequent_mask(length): return mask -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - def shift_1d(x): x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] return x -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2, 3) * mask - return path - - def clip_grad_value_(parameters, clip_value, norm_type=2): if isinstance(parameters, torch.Tensor): parameters = [parameters] diff --git a/TTS/vc/layers/freevc/mel_processing.py b/TTS/vc/layers/freevc/mel_processing.py new file mode 100644 index 0000000000..017d900284 --- /dev/null +++ b/TTS/vc/layers/freevc/mel_processing.py @@ -0,0 +1,58 @@ +import logging + +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn + +from TTS.utils.audio.torch_transforms import amp_to_db + +logger = logging.getLogger(__name__) + +MAX_WAV_VALUE = 32768.0 + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.0: + logger.info("Min value is: %.3f", torch.min(y)) + if torch.max(y) > 1.0: + logger.info("Max value is: %.3f", torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + 
fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = amp_to_db(spec) + + return spec diff --git a/TTS/vc/modules/freevc/modules.py b/TTS/vc/layers/freevc/modules.py similarity index 97% rename from TTS/vc/modules/freevc/modules.py rename to TTS/vc/layers/freevc/modules.py index 722444a303..92df39b5e0 100644 --- a/TTS/vc/modules/freevc/modules.py +++ b/TTS/vc/layers/freevc/modules.py @@ -5,9 +5,9 @@ from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations -import TTS.vc.modules.freevc.commons as commons from TTS.tts.layers.generic.normalization import LayerNorm2 -from TTS.vc.modules.freevc.commons import init_weights +from TTS.tts.layers.generic.wavenet import fused_add_tanh_sigmoid_multiply +from TTS.vc.layers.freevc.commons import init_weights from TTS.vocoder.models.hifigan_generator import get_padding LRELU_SLOPE = 0.1 @@ -48,7 +48,7 @@ def forward(self, x, x_mask): class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() + super().__init__() assert kernel_size % 2 == 1 self.hidden_channels = hidden_channels self.kernel_size = (kernel_size,) @@ -99,7 +99,7 @@ def forward(self, x, x_mask, g=None, **kwargs): else: g_l = torch.zeros_like(x_in) - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) acts = self.drop(acts) res_skip_acts = self.res_skip_layers[i](acts) @@ -122,7 +122,7 @@ def remove_weight_norm(self): class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() + super().__init__() self.convs1 = nn.ModuleList( [ weight_norm( @@ -198,7 +198,7 @@ def remove_weight_norm(self): class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() + super().__init__() self.convs = nn.ModuleList( [ weight_norm( diff --git a/TTS/vc/modules/freevc/speaker_encoder/__init__.py b/TTS/vc/layers/freevc/speaker_encoder/__init__.py similarity index 100% rename from TTS/vc/modules/freevc/speaker_encoder/__init__.py rename to TTS/vc/layers/freevc/speaker_encoder/__init__.py diff --git a/TTS/vc/modules/freevc/speaker_encoder/audio.py b/TTS/vc/layers/freevc/speaker_encoder/audio.py similarity index 92% rename from TTS/vc/modules/freevc/speaker_encoder/audio.py rename to TTS/vc/layers/freevc/speaker_encoder/audio.py index 5b23a4dbb6..5d14bf2f19 100644 --- 
a/TTS/vc/modules/freevc/speaker_encoder/audio.py +++ b/TTS/vc/layers/freevc/speaker_encoder/audio.py @@ -1,11 +1,10 @@ from pathlib import Path -from typing import Optional, Union # import webrtcvad import librosa import numpy as np -from TTS.vc.modules.freevc.speaker_encoder.hparams import ( +from TTS.vc.layers.freevc.speaker_encoder.hparams import ( audio_norm_target_dBFS, mel_n_channels, mel_window_length, @@ -16,7 +15,7 @@ int16_max = (2**15) - 1 -def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None): +def preprocess_wav(fpath_or_wav: str | Path | np.ndarray, source_sr: int | None = None): """ Applies the preprocessing operations used in training the Speaker Encoder to a waveform either on disk or in memory. The waveform will be resampled to match the data hyperparameters. diff --git a/TTS/vc/modules/freevc/speaker_encoder/hparams.py b/TTS/vc/layers/freevc/speaker_encoder/hparams.py similarity index 100% rename from TTS/vc/modules/freevc/speaker_encoder/hparams.py rename to TTS/vc/layers/freevc/speaker_encoder/hparams.py diff --git a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py similarity index 81% rename from TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py rename to TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index 294bf322cb..d2f4ffe394 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -1,14 +1,13 @@ import logging from time import perf_counter as timer -from typing import List, Union import numpy as np import torch from torch import nn from trainer.io import load_fsspec -from TTS.vc.modules.freevc.speaker_encoder import audio -from TTS.vc.modules.freevc.speaker_encoder.hparams import ( +from TTS.vc.layers.freevc.speaker_encoder import audio +from TTS.vc.layers.freevc.speaker_encoder.hparams import ( mel_n_channels, mel_window_step, model_embedding_size, @@ -22,12 +21,8 @@ class SpeakerEncoder(nn.Module): - def __init__(self, weights_fpath, device: Union[str, torch.device] = None): - """ - :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). - If None, defaults to cuda if it is available on your machine, otherwise the model will - run on cpu. Outputs are always returned on the cpu, as numpy arrays. 
- """ + def __init__(self, weights_fpath): + """FreeVC speaker encoder.""" super().__init__() # Define the network @@ -35,13 +30,6 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() - # Get the target device - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - elif isinstance(device, str): - device = torch.device(device) - self.device = device - # Load the pretrained model'speaker weights # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") # if not weights_fpath.exists(): @@ -52,8 +40,11 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): checkpoint = load_fsspec(weights_fpath, map_location="cpu") self.load_state_dict(checkpoint["model_state"], strict=False) - self.to(device) - logger.info("Loaded the voice encoder model on %s in %.2f seconds.", device.type, timer() - start) + logger.info("Loaded the voice encoder model in %.2f seconds.", timer() - start) + + @property + def device(self): + return next(self.parameters()).device def forward(self, mels: torch.FloatTensor): """ @@ -97,7 +88,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): assert 0 < min_coverage <= 1 # Compute how many frames separate two partial utterances - samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + samples_per_frame = int(sampling_rate * mel_window_step / 1000) n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) assert 0 < frame_step, "The rate is too high" @@ -123,7 +114,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): return wav_slices, mel_slices - def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75) -> torch.Tensor: """ Computes an embedding for a single utterance. The utterance is divided in partial utterances and an embedding is computed for each. The complete utterance embedding is the @@ -143,8 +134,8 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ then the last partial utterance will be considered by zero-padding the audio. Otherwise, it will be discarded. If there aren't enough frames for one partial utterance, this parameter is ignored so that the function always returns at least one slice. - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - is True, the partial utterances as a numpy array of float32 of shape + :return: the embedding as a float tensor of shape (model_embedding_size,). If + is True, the partial utterances as a float tensor of shape (n_partials, model_embedding_size) and the wav partials as a list of slices will also be returned. 
""" @@ -160,24 +151,26 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ mels = np.array([mel[s] for s in mel_slices]) with torch.no_grad(): mels = torch.from_numpy(mels).to(self.device) - partial_embeds = self(mels).cpu().numpy() + partial_embeds = self(mels) # Compute the utterance embedding from the partial embeddings - raw_embed = np.mean(partial_embeds, axis=0) - embed = raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = partial_embeds.mean(dim=0) + embed = raw_embed / torch.norm(raw_embed, p=2) if return_partials: return embed, partial_embeds, wav_slices return embed - def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + def embed_speaker(self, wavs: list[np.ndarray], **kwargs): """ Compute the embedding of a collection of wavs (presumably from the same speaker) by averaging their embedding and L2-normalizing it. :param wavs: list of wavs a numpy arrays of float32. :param kwargs: extra arguments to embed_utterance() - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + :return: the embedding as a float tensor of shape (model_embedding_size,). """ - raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs], axis=0) - return raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = torch.mean( + torch.stack([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs]), dim=0 + ) + return raw_embed / torch.norm(raw_embed, p=2) diff --git a/TTS/vc/modules/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py similarity index 91% rename from TTS/vc/modules/freevc/wavlm/__init__.py rename to TTS/vc/layers/freevc/wavlm/__init__.py index 4046e137f5..d9c3858f89 100644 --- a/TTS/vc/modules/freevc/wavlm/__init__.py +++ b/TTS/vc/layers/freevc/wavlm/__init__.py @@ -6,14 +6,14 @@ from trainer.io import get_user_data_dir from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig +from TTS.vc.layers.freevc.wavlm.wavlm import WavLM, WavLMConfig logger = logging.getLogger(__name__) model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" -def get_wavlm(device="cpu"): +def get_wavlm(device="cpu") -> WavLM: """Download the model and return the model object.""" output_path = get_user_data_dir("tts") diff --git a/TTS/vc/modules/freevc/wavlm/config.json b/TTS/vc/layers/freevc/wavlm/config.json similarity index 100% rename from TTS/vc/modules/freevc/wavlm/config.json rename to TTS/vc/layers/freevc/wavlm/config.json diff --git a/TTS/vc/modules/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py similarity index 96% rename from TTS/vc/modules/freevc/wavlm/modules.py rename to TTS/vc/layers/freevc/wavlm/modules.py index 37c1a6e877..cf31a866de 100644 --- a/TTS/vc/modules/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -9,7 +9,6 @@ import math import warnings -from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -89,7 +88,7 @@ class Swish(nn.Module): def __init__(self): """Construct an MultiHeadedAttention object.""" - super(Swish, self).__init__() + super().__init__() self.act = torch.nn.Sigmoid() def forward(self, x): @@ -98,7 +97,7 @@ def forward(self, x): class GLU_Linear(nn.Module): def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): - super(GLU_Linear, self).__init__() + super().__init__() self.glu_type = glu_type self.output_dim = output_dim @@ -158,7 +157,7 @@ 
def get_activation_fn(activation: str): elif activation == "glu": return lambda x: x else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError(f"--activation-fn {activation} not supported") def init_bert_params(module): @@ -219,7 +218,7 @@ def quant_noise(module, p, block_size): return module # supported modules - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + assert isinstance(module, nn.Linear | nn.Embedding | nn.Conv2d) # test whether module.weight has the right sizes wrt block_size is_conv = module.weight.ndim == 4 @@ -331,7 +330,7 @@ def __init__( self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" + "Self-attention requires query, key and value to be of the same size" ) k_bias = True @@ -424,17 +423,17 @@ def compute_bias(self, query_length, key_length): def forward( self, query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + key: Tensor | None, + value: Tensor | None, + key_padding_mask: Tensor | None = None, + incremental_state: dict[str, dict[str, Tensor | None]] | None = None, need_weights: bool = True, static_kv: bool = False, - attn_mask: Optional[Tensor] = None, + attn_mask: Tensor | None = None, before_softmax: bool = False, need_head_weights: bool = False, - position_bias: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + position_bias: Tensor | None = None, + ) -> tuple[Tensor, Tensor | None, Tensor | None]: """Input shape: Time x Batch x Channel Args: @@ -605,7 +604,7 @@ def forward( else: assert v is not None v = torch.cat([prev_value, v], dim=1) - prev_key_padding_mask: Optional[Tensor] = None + prev_key_padding_mask: Tensor | None = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None @@ -700,7 +699,7 @@ def forward( assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) - attn_weights: Optional[Tensor] = None + attn_weights: Tensor | None = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) if not need_head_weights: @@ -711,12 +710,12 @@ def forward( @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], + key_padding_mask: Tensor | None, + prev_key_padding_mask: Tensor | None, batch_size: int, src_len: int, static_kv: bool, - ) -> Optional[Tensor]: + ) -> Tensor | None: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask @@ -748,19 +747,19 @@ def _append_prev_key_padding_mask( return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: + self, incremental_state: dict[str, dict[str, Tensor | None]] | None + ) -> dict[str, Tensor | None]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: - empty_result: Dict[str, Optional[Tensor]] = {} + empty_result: dict[str, Tensor | None] = {} return empty_result def 
_set_input_buffer( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], + incremental_state: dict[str, dict[str, Tensor | None]], + buffer: dict[str, Tensor | None], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py similarity index 96% rename from TTS/vc/modules/freevc/wavlm/wavlm.py rename to TTS/vc/layers/freevc/wavlm/wavlm.py index 10dd09ed0c..6358662e18 100644 --- a/TTS/vc/modules/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -9,7 +9,7 @@ import logging import math -from typing import List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -17,7 +17,7 @@ import torch.nn.functional as F from torch.nn import LayerNorm -from TTS.vc.modules.freevc.wavlm.modules import ( +from TTS.vc.layers.freevc.wavlm.modules import ( Fp32GroupNorm, Fp32LayerNorm, GLU_Linear, @@ -33,8 +33,8 @@ def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[torch.Tensor], + shape: tuple[int, int], + padding_mask: torch.Tensor | None, mask_prob: float, mask_length: int, mask_type: str = "static", @@ -68,8 +68,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() + mask_prob * all_sz / float(mask_length) + np.random.rand() ) all_num_mask = max(min_masks, all_num_mask) @@ -80,8 +79,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() + mask_prob * sz / float(mask_length) + np.random.rand() ) num_mask = max(min_masks, num_mask) else: @@ -155,9 +153,7 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = ( - "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - ) + self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -166,9 +162,7 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = ( - "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - ) + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] 
self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this @@ -225,7 +219,7 @@ def __init__( cfg: WavLMConfig, ) -> None: super().__init__() - logger.info(f"WavLM Config: {cfg.__dict__}") + logger.info("WavLM Config: %s", cfg.__dict__) self.cfg = cfg feature_enc_layers = eval(cfg.conv_feature_layers) @@ -317,12 +311,12 @@ def forward_padding_mask( def extract_features( self, source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, + padding_mask: torch.Tensor | None = None, mask: bool = False, ret_conv: bool = False, - output_layer: Optional[int] = None, + output_layer: int | None = None, ret_layer_results: bool = False, - ): + ) -> tuple[torch.Tensor, dict[str, Any]]: if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: @@ -367,7 +361,7 @@ def extract_features( class ConvFeatureExtractionModel(nn.Module): def __init__( self, - conv_layers: List[Tuple[int, int, int]], + conv_layers: list[tuple[int, int, int]], dropout: float = 0.0, mode: str = "default", conv_bias: bool = False, diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index a498b292b7..859eaeb2a7 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,20 +1,21 @@ import importlib import logging import re -from typing import Dict, List, Union - -logger = logging.getLogger(__name__) +from TTS.vc.configs.shared_configs import BaseVCConfig +from TTS.vc.models.base_vc import BaseVC -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) +logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": +def setup_model(config: BaseVCConfig) -> BaseVC: logger.info("Using model: %s", config.model) # fetch the right model implementation. - if "model" in config and config["model"].lower() == "freevc": + if config["model"].lower() == "freevc": MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC - model = MyModel.init_from_config(config, samples) - return model + elif config["model"].lower() == "knnvc": + MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC + else: + msg = f"Model {config.model} does not exist!" + raise ValueError(msg) + return MyModel.init_from_config(config) diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 22ffd0095c..a953b901e8 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -1,7 +1,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import torch import torch.distributed as dist @@ -37,9 +37,9 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: AudioProcessor, - speaker_manager: Optional[SpeakerManager] = None, - language_manager: Optional[LanguageManager] = None, + ap: AudioProcessor | None = None, + speaker_manager: SpeakerManager | None = None, + language_manager: LanguageManager | None = None, ) -> None: super().__init__() self.config = config @@ -51,7 +51,7 @@ def __init__( def _set_model_args(self, config: Coqpit) -> None: """Setup model args based on the config type (`ModelConfig` or `ModelArgs`). - `ModelArgs` has all the fields reuqired to initialize the model architecture. + `ModelArgs` has all the fields required to initialize the model architecture. `ModelConfig` has all the fields required for training, inference and containes `ModelArgs`. 
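As a side note on the rewritten `setup_model` in `TTS/vc/models/__init__.py` above, a minimal dispatch sketch (config defaults only, no checkpoint loaded, so purely illustrative):

    from TTS.vc.configs.knnvc_config import KNNVCConfig
    from TTS.vc.models import setup_model

    model = setup_model(KNNVCConfig())  # config.model == "knnvc" -> KNNVC.init_from_config(config)
    # any other config.model value now raises ValueError("Model ... does not exist!")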
@@ -69,7 +69,7 @@ def _set_model_args(self, config: Coqpit) -> None: else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining `in_channels` size of the connected layers. @@ -106,7 +106,7 @@ def get_aux_input(self, **kwargs: Any) -> dict[str, Any]: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_sentences(self, sentence_info: Union[str, list[str]]) -> dict[str, Any]: + def get_aux_input_from_test_sentences(self, sentence_info: str | list[str]) -> dict[str, Any]: if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -199,9 +199,9 @@ def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -275,10 +275,10 @@ def get_data_loader( config: Coqpit, assets: dict, is_eval: bool, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: Optional[int] = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -402,13 +402,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer: Trainer) -> None: diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index e5cfdc1e61..59af40a836 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,29 +1,27 @@ import logging -from typing import Dict, List, Optional, Tuple, Union import librosa import numpy as np import torch from coqpit import Coqpit from torch import nn -from torch.nn import Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import spectral_norm from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec -import TTS.vc.modules.freevc.commons as commons -import TTS.vc.modules.freevc.modules as modules +import TTS.vc.layers.freevc.modules as modules +from TTS.tts.layers.vits.discriminator import DiscriminatorS from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import SpeakerManager from 
TTS.vc.configs.freevc_config import FreeVCConfig +from TTS.vc.layers.freevc.commons import init_weights, rand_slice_segments +from TTS.vc.layers.freevc.mel_processing import mel_spectrogram_torch +from TTS.vc.layers.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx +from TTS.vc.layers.freevc.wavlm import get_wavlm from TTS.vc.models.base_vc import BaseVC -from TTS.vc.modules.freevc.commons import init_weights -from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch -from TTS.vc.modules.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx -from TTS.vc.modules.freevc.wavlm import get_wavlm -from TTS.vocoder.models.hifigan_generator import get_padding +from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP logger = logging.getLogger(__name__) @@ -103,7 +101,7 @@ def __init__( upsample_kernel_sizes, gin_channels=0, ): - super(Generator, self).__init__() + super().__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) @@ -164,78 +162,9 @@ def remove_weight_norm(self): remove_parametrizations(l, "weight") -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), - ] - ) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): - 
super(MultiPeriodDiscriminator, self).__init__() + super().__init__() periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] @@ -260,7 +189,7 @@ def forward(self, y, y_hat): class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): - super(SpeakerEncoder, self).__init__() + super().__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() @@ -303,7 +232,7 @@ def embed_utterance(self, mel, partial_frames=128, partial_hop=64): class FreeVC(BaseVC): """ - Papaer:: + Paper:: https://arxiv.org/abs/2210.15418# Paper Abstract:: @@ -376,15 +305,11 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): self.wavlm = get_wavlm() - @property - def device(self): - return next(self.parameters()).device - def load_pretrained_speaker_encoder(self): """Load pretrained speaker encoder model as mentioned in the paper.""" logger.info("Loading pretrained speaker encoder model ...") self.enc_spk_ex = SpeakerEncoderEx( - "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt", device=self.device + "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt" ) def init_multispeaker(self, config: Coqpit): @@ -405,15 +330,15 @@ def forward( self, c: torch.Tensor, spec: torch.Tensor, - g: Optional[torch.Tensor] = None, - mel: Optional[torch.Tensor] = None, - c_lengths: Optional[torch.Tensor] = None, - spec_lengths: Optional[torch.Tensor] = None, - ) -> Tuple[ + g: torch.Tensor | None = None, + mel: torch.Tensor | None = None, + c_lengths: torch.Tensor | None = None, + spec_lengths: torch.Tensor | None = None, + ) -> tuple[ torch.Tensor, torch.Tensor, torch.Tensor, - Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ]: """ Forward pass of the model. @@ -454,13 +379,13 @@ def forward( z_p = self.flow(z, spec_mask, g=g) # Randomly slice z and compute o using dec - z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size) + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - @torch.no_grad() - def inference(self, c, g=None, mel=None, c_lengths=None): + @torch.inference_mode() + def inference(self, c, g=None, c_lengths=None): """ Inference pass of the model @@ -475,9 +400,6 @@ def inference(self, c, g=None, mel=None, c_lengths=None): """ if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) - if not self.use_spk: - g = self.enc_spk.embed_utterance(mel) - g = g.unsqueeze(-1) z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) z = self.flow(z_p, c_mask, g=g, reverse=True) o = self.dec(z * c_mask, g=g) @@ -508,51 +430,52 @@ def load_audio(self, wav): return wav.float() @torch.inference_mode() - def voice_conversion(self, src, tgt): + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]): """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterances. Returns: torch.Tensor: Output tensor. 
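An illustrative call for the updated `voice_conversion` signature (audio paths are placeholders, and a real run additionally needs trained weights via `load_checkpoint`); speaker embeddings of all target utterances are averaged before decoding:

    from TTS.vc.configs.freevc_config import FreeVCConfig
    from TTS.vc.models.freevc import FreeVC

    model = FreeVC.init_from_config(FreeVCConfig())  # untrained; shown for the call convention only
    wav_out = model.voice_conversion("source.wav", ["target_a.wav", "target_b.wav"])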
""" - wav_tgt = self.load_audio(tgt).cpu().numpy() - wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) - - if self.config.model_args.use_spk: - g_tgt = self.enc_spk_ex.embed_utterance(wav_tgt) - g_tgt = torch.from_numpy(g_tgt)[None, :, None].to(self.device) - else: - wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) - mel_tgt = mel_spectrogram_torch( - wav_tgt, - self.config.audio.filter_length, - self.config.audio.n_mel_channels, - self.config.audio.input_sample_rate, - self.config.audio.hop_length, - self.config.audio.win_length, - self.config.audio.mel_fmin, - self.config.audio.mel_fmax, - ) # src wav_src = self.load_audio(src) c = self.extract_wavlm_features(wav_src[None, :]) - if self.config.model_args.use_spk: - audio = self.inference(c, g=g_tgt) - else: - audio = self.inference(c, mel=mel_tgt.transpose(1, 2)) - audio = audio[0][0].data.cpu().float().numpy() - return audio + # tgt + g_tgts = [] + for tg in tgt: + wav_tgt = self.load_audio(tg).cpu().numpy() + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + + if self.config.model_args.use_spk: + g_tgts.append(self.enc_spk_ex.embed_utterance(wav_tgt)[None, :, None]) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + self.config.audio.filter_length, + self.config.audio.n_mel_channels, + self.config.audio.input_sample_rate, + self.config.audio.hop_length, + self.config.audio.win_length, + self.config.audio.mel_fmin, + self.config.audio.mel_fmax, + ) + g_tgts.append(self.enc_spk.embed_utterance(mel_tgt.transpose(1, 2)).unsqueeze(-1)) + + g_tgt = torch.stack(g_tgts).mean(dim=0) + audio = self.inference(c, g=g_tgt) + return audio[0][0].data.cpu().float().numpy() def eval_step(): ... @staticmethod - def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: FreeVCConfig) -> "FreeVC": model = FreeVC(config) return model diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py new file mode 100644 index 0000000000..c31f52e749 --- /dev/null +++ b/TTS/vc/models/knnvc.py @@ -0,0 +1,181 @@ +import logging +import os +from typing import Any, TypeAlias + +import torch +import torch.nn.functional as F +import torchaudio +from coqpit import Coqpit + +from TTS.vc.configs.knnvc_config import KNNVCConfig +from TTS.vc.layers.freevc.wavlm import get_wavlm +from TTS.vc.models.base_vc import BaseVC + +logger = logging.getLogger(__name__) + +PathOrTensor: TypeAlias = str | os.PathLike[Any] | torch.Tensor + + +class KNNVC(BaseVC): + """ + Paper:: + https://arxiv.org/abs/2305.18975 + + Paper Abstract:: + Any-to-any voice conversion aims to transform source speech + into a target voice with just a few examples of the target speaker as a + reference. Recent methods produce convincing conversions, but at the cost of + increased complexity -- making results difficult to reproduce and build on. + Instead, we keep it simple. We propose k-nearest neighbors voice conversion + (kNN-VC): a straightforward yet effective method for any-to-any conversion. + First, we extract self-supervised representations of the source and reference + speech. To convert to the target speaker, we replace each frame of the source + representation with its nearest neighbor in the reference. Finally, a pretrained + vocoder synthesizes audio from the converted representation. Objective and + subjective evaluations show that kNN-VC improves speaker similarity with similar + intelligibility scores to existing methods. 
+ + Samples:: + https://bshall.github.io/knn-vc + + Original code:: + https://github.com/bshall/knn-vc + + Examples: + >>> from TTS.vc.configs.knnvc_config import KNNVCConfig + >>> from TTS.vc.models.knnvc import KNNVC + >>> config = KNNVCConfig() + >>> model = KNNVC(config) + """ + + def __init__(self, config: Coqpit): + super().__init__(config) + self.ssl_dim = self.args.ssl_dim + self.wavlm = get_wavlm() + + @staticmethod + def init_from_config(config: KNNVCConfig) -> "KNNVC": + return KNNVC(config) + + @torch.inference_mode() + def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor: + """Return features for the given waveform with output shape (seq_len, dim). + + Optionally perform VAD trimming on start/end with `vad_trigger_level`. + """ + # load audio + if isinstance(audio, torch.Tensor): + x: torch.Tensor = audio + sr = self.config.audio.sample_rate + if x.dim() == 1: + x = x[None] + else: + x, sr = torchaudio.load(audio, normalize=True) + + if not sr == self.config.audio.sample_rate: + logger.info("Resampling %d to %d in %s", sr, self.config.audio.sample_rate, audio) + x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate) + sr = self.config.audio.sample_rate + + # trim silence from front and back + if vad_trigger_level > 1e-3: + transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level) + x_front_trim = transform(x) + waveform_reversed = torch.flip(x_front_trim, (-1,)) + waveform_reversed_front_trim = transform(waveform_reversed) + x = torch.flip(waveform_reversed_front_trim, (-1,)) + + # extract the representation of each layer + wav_input_16khz = x.to(self.device) + features = self.wavlm.extract_features( + wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False + )[0] + return features.squeeze(0) + + def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor: + """Get concatenated wavlm features for the matching set using all waveforms in `wavs`. + + Wavs are specified as either a list of paths or list of loaded waveform tensors of + shape (channels, T), assumed to be of 16kHz sample rate. + """ + feats = [] + for p in wavs: + feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level)) + + feats = torch.concat(feats, dim=0).cpu() + return feats + + @staticmethod + def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor: + """Like torch.cdist, but fixed dim=-1 and for cosine distance.""" + source_norms = torch.norm(source_feats, p=2, dim=-1) + matching_norms = torch.norm(matching_pool, p=2, dim=-1) + dotprod = ( + -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2) + + source_norms[:, None] ** 2 + + matching_norms[None] ** 2 + ) + dotprod /= 2 + + dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None])) + return dists + + @torch.inference_mode() + def match( + self, + query_seq: torch.Tensor, + matching_set: torch.Tensor, + synth_set: torch.Tensor | None = None, + topk: int | None = None, + target_duration: float | None = None, + ) -> torch.Tensor: + """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching + with k=`topk`. + + Args: + `query_seq`: Tensor (N1, dim) of the input/source query features. + `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm. + `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. 
We use the matching set to assign + each query vector to a vector in the matching set, and then use the corresponding vector from + the synth set during HiFiGAN synthesis. + By default, and for best performance, this should be identical to the matching set. + `topk`: k in the kNN -- the number of nearest neighbors to average over. + `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds. + + Returns: + - converted features (1, N, dim) + """ + if topk is None: + topk = self.config.topk + synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device) + matching_set = matching_set.to(self.device) + query_seq = query_seq.to(self.device) + + if target_duration is not None: + target_samples = int(target_duration * self.config.audio.sample_rate) + scale_factor = (target_samples / self.hop_length) / query_seq.shape[0] # n_targ_feats / n_input_feats + query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T + + dists = self.fast_cosine_dist(query_seq, matching_set) + best = dists.topk(k=topk, largest=False, dim=-1) + out_feats = synth_set[best.indices].mean(dim=1) + return out_feats.unsqueeze(0) + + def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: str | os.PathLike[Any]) -> None: + """kNN-VC does not use checkpoints.""" + + def forward(self) -> None: ... + def inference(self) -> None: ... + + @torch.inference_mode() + def voice_conversion( + self, + source: PathOrTensor, + target: list[PathOrTensor], + topk: int | None = None, + ) -> torch.Tensor: + if not isinstance(target, list): + target = [target] + source_features = self.get_features(source) + matching_set = self.get_matching_set(target) + return self.match(source_features, matching_set, topk=topk) diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py new file mode 100644 index 0000000000..1049a580c7 --- /dev/null +++ b/TTS/vc/models/openvoice.py @@ -0,0 +1,320 @@ +import json +import logging +import os +from collections.abc import Mapping +from pathlib import Path +from typing import Any + +import librosa +import numpy as np +import numpy.typing as npt +import torch +from coqpit import Coqpit +from torch import nn +from torch.nn import functional as F +from trainer.io import load_fsspec + +from TTS.tts.layers.vits.networks import PosteriorEncoder +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio.torch_transforms import wav_to_spec +from TTS.vc.configs.openvoice_config import OpenVoiceConfig +from TTS.vc.models.base_vc import BaseVC +from TTS.vc.models.freevc import Generator, ResidualCouplingBlock + +logger = logging.getLogger(__name__) + + +class ReferenceEncoder(nn.Module): + """NN module creating a fixed size prosody embedding from a spectrogram. 
+ + inputs: mel spectrograms [batch_size, num_spec_frames, num_mel] + outputs: [batch_size, embedding_dim] + """ + + def __init__(self, spec_channels: int, embedding_dim: int = 0, layernorm: bool = True) -> None: + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + torch.nn.utils.parametrizations.weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, embedding_dim) + self.layernorm = nn.LayerNorm(self.spec_channels) if layernorm else None + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + N = inputs.size(0) + + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + if self.layernorm is not None: + out = self.layernorm(out) + + for conv in self.convs: + out = conv(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + _memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)) + + def calculate_channels(self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int: + for _ in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class OpenVoice(BaseVC): + """ + OpenVoice voice conversion model (inference only). + + Source: https://github.com/myshell-ai/OpenVoice + Paper: https://arxiv.org/abs/2312.01479 + + Paper abstract: + We introduce OpenVoice, a versatile voice cloning approach that requires + only a short audio clip from the reference speaker to replicate their voice and + generate speech in multiple languages. OpenVoice represents a significant + advancement in addressing the following open challenges in the field: 1) + Flexible Voice Style Control. OpenVoice enables granular control over voice + styles, including emotion, accent, rhythm, pauses, and intonation, in addition + to replicating the tone color of the reference speaker. The voice styles are not + directly copied from and constrained by the style of the reference speaker. + Previous approaches lacked the ability to flexibly manipulate voice styles after + cloning. 2) Zero-Shot Cross-Lingual Voice Cloning. OpenVoice achieves zero-shot + cross-lingual voice cloning for languages not included in the massive-speaker + training set. Unlike previous approaches, which typically require extensive + massive-speaker multi-lingual (MSML) dataset for all languages, OpenVoice can + clone voices into a new language without any massive-speaker training data for + that language. OpenVoice is also computationally efficient, costing tens of + times less than commercially available APIs that offer even inferior + performance. To foster further research in the field, we have made the source + code and trained model publicly accessible. We also provide qualitative results + in our demo website. 
Prior to its public release, our internal version of + OpenVoice was used tens of millions of times by users worldwide between May and + October 2023, serving as the backend of MyShell. + """ + + def __init__(self, config: Coqpit, speaker_manager: SpeakerManager | None = None) -> None: + super().__init__(config, None, speaker_manager, None) + + self.init_multispeaker(config) + + self.zero_g = self.args.zero_g + self.inter_channels = self.args.inter_channels + self.hidden_channels = self.args.hidden_channels + self.filter_channels = self.args.filter_channels + self.n_heads = self.args.n_heads + self.n_layers = self.args.n_layers + self.kernel_size = self.args.kernel_size + self.p_dropout = self.args.p_dropout + self.resblock = self.args.resblock + self.resblock_kernel_sizes = self.args.resblock_kernel_sizes + self.resblock_dilation_sizes = self.args.resblock_dilation_sizes + self.upsample_rates = self.args.upsample_rates + self.upsample_initial_channel = self.args.upsample_initial_channel + self.upsample_kernel_sizes = self.args.upsample_kernel_sizes + self.n_layers_q = self.args.n_layers_q + self.use_spectral_norm = self.args.use_spectral_norm + self.gin_channels = self.args.gin_channels + self.tau = self.args.tau + + self.spec_channels = config.audio.fft_size // 2 + 1 + + self.dec = Generator( + self.inter_channels, + self.resblock, + self.resblock_kernel_sizes, + self.resblock_dilation_sizes, + self.upsample_rates, + self.upsample_initial_channel, + self.upsample_kernel_sizes, + gin_channels=self.gin_channels, + ) + self.enc_q = PosteriorEncoder( + self.spec_channels, + self.inter_channels, + self.hidden_channels, + kernel_size=5, + dilation_rate=1, + num_layers=16, + cond_channels=self.gin_channels, + ) + + self.flow = ResidualCouplingBlock( + self.inter_channels, + self.hidden_channels, + kernel_size=5, + dilation_rate=1, + n_layers=4, + gin_channels=self.gin_channels, + ) + + self.ref_enc = ReferenceEncoder(self.spec_channels, self.gin_channels) + + @staticmethod + def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": + return OpenVoice(config) + + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + You must provide a `speaker_manager` at initialization to set up the multi-speaker modules. + + Args: + config (Coqpit): Model configuration. + data (list, optional): Dataset items to infer number of speakers. Defaults to None. 
+ """ + self.num_spks = config.num_speakers + if self.speaker_manager: + self.num_spks = self.speaker_manager.num_speakers + + def load_checkpoint( + self, + config: OpenVoiceConfig, + checkpoint_path: str | os.PathLike[Any], + eval: bool = False, + strict: bool = True, + cache: bool = False, + ) -> None: + """Map from OpenVoice's config structure.""" + config_path = Path(checkpoint_path).parent / "config.json" + with open(config_path, encoding="utf-8") as f: + config_org = json.load(f) + self.config.audio.input_sample_rate = config_org["data"]["sampling_rate"] + self.config.audio.output_sample_rate = config_org["data"]["sampling_rate"] + self.config.audio.fft_size = config_org["data"]["filter_length"] + self.config.audio.hop_length = config_org["data"]["hop_length"] + self.config.audio.win_length = config_org["data"]["win_length"] + state = load_fsspec(str(checkpoint_path), map_location=torch.device("cpu"), cache=cache) + self.load_state_dict(state["model"], strict=strict) + if eval: + self.eval() + + def forward(self) -> None: ... + def train_step(self) -> None: ... + def eval_step(self) -> None: ... + + @staticmethod + def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, torch.Tensor | None]) -> torch.Tensor: + if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: + return aux_input["x_lengths"] + return torch.tensor(x.shape[-1:]).to(x.device) + + @torch.inference_mode() + def inference( + self, + x: torch.Tensor, + aux_input: Mapping[str, torch.Tensor | None] = {"x_lengths": None, "g_src": None, "g_tgt": None}, + ) -> dict[str, torch.Tensor]: + """ + Inference pass of the model + + Args: + x (torch.Tensor): Input tensor. Shape: (batch_size, c_seq_len). + x_lengths (torch.Tensor): Lengths of the input tensor. Shape: (batch_size,). + g_src (torch.Tensor): Source speaker embedding tensor. Shape: (batch_size, spk_emb_dim). + g_tgt (torch.Tensor): Target speaker embedding tensor. Shape: (batch_size, spk_emb_dim). + + Returns: + o_hat: Output spectrogram tensor. Shape: (batch_size, spec_seq_len, spec_dim). + x_mask: Spectrogram mask. Shape: (batch_size, spec_seq_len). + (z, z_p, z_hat): A tuple of latent variables. 
+ """ + x_lengths = self._set_x_lengths(x, aux_input) + if "g_src" in aux_input and aux_input["g_src"] is not None: + g_src = aux_input["g_src"] + else: + raise ValueError("aux_input must define g_src") + if "g_tgt" in aux_input and aux_input["g_tgt"] is not None: + g_tgt = aux_input["g_tgt"] + else: + raise ValueError("aux_input must define g_tgt") + z, _m_q, _logs_q, y_mask = self.enc_q( + x, x_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=self.tau + ) + z_p = self.flow(z, y_mask, g=g_src) + z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) + o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt)) + return { + "model_outputs": o_hat, + "y_mask": y_mask, + "z": z, + "z_p": z_p, + "z_hat": z_hat, + } + + def load_audio(self, wav: str | npt.NDArray[np.float32] | torch.Tensor | list[float]) -> torch.Tensor: + """Read and format the input audio.""" + if isinstance(wav, str): + out = torch.from_numpy(librosa.load(wav, sr=self.config.audio.input_sample_rate)[0]) + elif isinstance(wav, np.ndarray): + out = torch.from_numpy(wav) + elif isinstance(wav, list): + out = torch.from_numpy(np.array(wav)) + else: + out = wav + return out.to(self.device).float() + + def extract_se(self, audio: str | torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + y = self.load_audio(audio) + y = y.to(self.device) + y = y.unsqueeze(0) + spec = wav_to_spec( + y, + n_fft=self.config.audio.fft_size, + hop_length=self.config.audio.hop_length, + win_length=self.config.audio.win_length, + center=False, + ).to(self.device) + with torch.no_grad(): + g = self.ref_enc(spec.transpose(1, 2)).unsqueeze(-1) + + return g, spec + + @torch.inference_mode() + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]) -> npt.NDArray[np.float32]: + """ + Voice conversion pass of the model. + + Args: + src (str or torch.Tensor): Source utterance. + tgt (list of str or torch.Tensor): Target utterance. + + Returns: + Output numpy array. 
+ """ + src_se, src_spec = self.extract_se(src) + tgt_ses = [] + for tg in tgt: + tgt_se, _ = self.extract_se(tg) + tgt_ses.append(tgt_se) + tgt_se = torch.stack(tgt_ses).mean(dim=0) + + aux_input = {"g_src": src_se, "g_tgt": tgt_se} + audio = self.inference(src_spec, aux_input) + return audio["model_outputs"][0, 0].data.cpu().float().numpy() diff --git a/TTS/vc/modules/freevc/mel_processing.py b/TTS/vc/modules/freevc/mel_processing.py deleted file mode 100644 index a3e251891a..0000000000 --- a/TTS/vc/modules/freevc/mel_processing.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging - -import torch -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn - -logger = logging.getLogger(__name__) - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - output = dynamic_range_compression_torch(magnitudes) - return output - - -def spectral_de_normalize_torch(magnitudes): - output = dynamic_range_decompression_torch(magnitudes) - return output - - -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - if torch.min(y) < -1.0: - logger.info("Min value is: %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("Max value is: %.3f", torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - return spec - - -def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): - if torch.min(y) < -1.0: - logger.info("Min value is: %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("Max value is: %.3f", torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, 
device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - - return spec diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py index 9a102f0c89..60dde496b2 100644 --- a/TTS/vocoder/configs/hifigan_config.py +++ b/TTS/vocoder/configs/hifigan_config.py @@ -5,7 +5,7 @@ @dataclass class HifiganConfig(BaseGANVocoderConfig): - """Defines parameters for FullBand MelGAN vocoder. + """Defines parameters for HifiGAN vocoder. Example: diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py index 763113537f..2139f47b0e 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.py +++ b/TTS/vocoder/configs/multiband_melgan_config.py @@ -121,7 +121,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig): pad_short: int = 2000 use_noise_augment: bool = False use_cache: bool = True - steps_to_start_discriminator: bool = 200000 + steps_to_start_discriminator: int = 200000 # LOSS PARAMETERS - overrides use_stft_loss: bool = True diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py index a558cfcabb..548505a54d 100644 --- a/TTS/vocoder/configs/shared_configs.py +++ b/TTS/vocoder/configs/shared_configs.py @@ -168,7 +168,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig): target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch # optimizer - grad_clip: float = field(default_factory=lambda: [5, 5]) + grad_clip: float | list[float] = field(default_factory=lambda: [5, 5]) lr_gen: float = 0.0002 # Initial learning rate. lr_disc: float = 0.0002 # Initial learning rate. lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html @@ -178,5 +178,5 @@ class BaseGANVocoderConfig(BaseVocoderConfig): scheduler_after_epoch: bool = True use_pqmf: bool = False # enable/disable using pqmf for multi-band training. (Multi-band MelGAN) - steps_to_start_discriminator = 0 # start training the discriminator after this number of steps. + steps_to_start_discriminator: int = 0 # start training the discriminator after this number of steps. diff_samples_for_G_and_D: bool = False # use different samples for G and D training steps. 
diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py index 67f324cfce..85662831ee 100644 --- a/TTS/vocoder/configs/univnet_config.py +++ b/TTS/vocoder/configs/univnet_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @@ -96,7 +95,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # model specific params discriminator_model: str = "univnet_discriminator" generator_model: str = "univnet_generator" - generator_model_params: Dict = field( + generator_model_params: dict = field( default_factory=lambda: { "in_channels": 64, "out_channels": 1, @@ -121,7 +120,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # loss weights - overrides stft_loss_weight: float = 2.5 - stft_loss_params: Dict = field( + stft_loss_params: dict = field( default_factory=lambda: { "n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], @@ -133,7 +132,7 @@ class UnivnetConfig(BaseGANVocoderConfig): hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 0 l1_spec_loss_weight: float = 0 - l1_spec_loss_params: Dict = field( + l1_spec_loss_params: dict = field( default_factory=lambda: { "use_mel": True, "sample_rate": 22050, @@ -153,7 +152,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) steps_to_start_discriminator: int = 200000 def __post_init__(self): diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index 04462817a8..cef6a50b05 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -1,5 +1,3 @@ -from typing import List - from coqpit import Coqpit from torch.utils.data import Dataset @@ -10,7 +8,7 @@ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List) -> Dataset: +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: list) -> Dataset: if config.model.lower() in "gan": dataset = GANDataset( ap=ap, diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index 0806c0d496..076545f8a2 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -32,7 +32,7 @@ def __init__( super().__init__() self.ap = ap self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.seq_len = seq_len self.hop_len = hop_len self.pad_short = pad_short @@ -128,9 +128,9 @@ def load_item(self, idx): # correct the audio length wrt padding applied in stft audio = np.pad(audio, (0, self.hop_len), mode="edge") audio = audio[: mel.shape[-1] * self.hop_len] - assert ( - mel.shape[-1] * self.hop_len == audio.shape[-1] - ), f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + assert mel.shape[-1] * self.hop_len == audio.shape[-1], ( + f" [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + ) audio = torch.from_numpy(audio).float().unsqueeze(0) mel = torch.from_numpy(mel).float().squeeze(0) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 6f34bccb7c..435330bebe 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -2,7 +2,6 @@ import os import random from multiprocessing import Manager -from typing import List, Tuple import numpy as np import torch @@ -65,7 +64,7 @@ def __getitem__(self, idx): item = self.load_item(idx) return item - def load_test_samples(self, num_samples: int) -> List[Tuple]: + def load_test_samples(self, num_samples: int) -> list[tuple]: """Return test samples. Args: @@ -103,9 +102,9 @@ def load_item(self, idx): audio = np.pad( audio, (0, self.seq_len + self.pad_short - len(audio)), mode="constant", constant_values=0.0 ) - assert ( - audio.shape[-1] >= self.seq_len + self.pad_short - ), f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + assert audio.shape[-1] >= self.seq_len + self.pad_short, ( + f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + ) # correct the audio length wrt hop length p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 4c4f5c48df..ffb71177c5 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -18,7 +18,7 @@ class WaveRNNDataset(Dataset): def __init__(self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, return_segments=True): super().__init__() self.ap = ap - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 8d4dd725ef..81a1f30884 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,5 +1,3 @@ -from typing import Dict, Union - import torch from torch import nn from torch.nn import functional as F @@ -226,9 +224,9 @@ class GeneratorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_stft_loss = C.use_stft_loss if "use_stft_loss" in C else False self.use_subband_stft_loss = C.use_subband_stft_loss if "use_subband_stft_loss" in C else False @@ -313,9 +311,9 @@ class DiscriminatorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_mse_gan_loss = C.use_mse_gan_loss self.use_hinge_gan_loss = C.use_hinge_gan_loss @@ -352,7 +350,7 @@ def forward(self, scores_fake, scores_real): class WaveRNNLoss(nn.Module): - def __init__(self, wave_rnn_mode: Union[str, int]): + def __init__(self, wave_rnn_mode: str | int): super().__init__() if wave_rnn_mode == "mold": self.loss_func = discretized_mix_logistic_loss @@ -363,6 +361,6 @@ def __init__(self, wave_rnn_mode: Union[str, int]): else: raise ValueError(" [!] 
Unknown mode for Wavernn.") - def forward(self, y_hat, y) -> Dict: + def forward(self, y_hat, y) -> dict: loss = self.loss_func(y_hat, y) return {"loss": loss} diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 8913a1132e..ab1a56e7fc 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -175,9 +175,9 @@ def location_variable_convolution(x, kernel, bias, dilation, hop_size): batch, _, in_length = x.shape batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == ( - kernel_length * hop_size - ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + assert in_length == (kernel_length * hop_size), ( + f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + ) padding = dilation * int((kernel_size - 1) / 2) x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 9f1512c6d4..187e7062e2 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -74,7 +74,7 @@ def shif_and_scale(x, scale, shift): class UBlock(nn.Module): def __init__(self, input_size, hidden_size, factor, dilation): super().__init__() - assert isinstance(dilation, (list, tuple)) + assert isinstance(dilation, list | tuple) assert len(dilation) == 4 self.factor = factor diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index 7a1716f16d..481d234a54 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -4,15 +4,14 @@ from coqpit import Coqpit -logger = logging.getLogger(__name__) - +from TTS.utils.generic_utils import to_camel +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig, BaseVocoderConfig +from TTS.vocoder.models.base_vocoder import BaseVocoder -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) +logger = logging.getLogger(__name__) -def setup_model(config: Coqpit): +def setup_model(config: BaseVocoderConfig) -> BaseVocoder: """Load models directly from configuration.""" if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") @@ -29,19 +28,20 @@ def setup_model(config: Coqpit): try: MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: - raise ValueError(f"Model {config.model} not exist!") from e + raise ValueError(f"Model {config.model} does not exist!") from e logger.info("Vocoder model: %s", config.model) return MyModel.init_from_config(config) -def setup_generator(c): +def setup_generator(c: BaseGANVocoderConfig): """TODO: use config object as arguments""" logger.info("Generator model: %s", c.generator_model) MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) # this is to preserve the Wavernn class name (instead of Wavernn) if c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"]) + model = MyModel(out_channels=1, **c.generator_model_params) elif c.generator_model.lower() in "melgan_generator": model = MyModel( in_channels=c.audio["num_mels"], @@ -97,8 +97,8 @@ def setup_generator(c): return model -def setup_discriminator(c): - """TODO: use config objekt as arguments""" +def setup_discriminator(c: BaseGANVocoderConfig): + """TODO: use config object as arguments""" logger.info("Discriminator model: %s", c.discriminator_model) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") @@ -107,7 +107,7 @@ def setup_discriminator(c): MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) if c.discriminator_model in "hifigan_discriminator": model = MyModel() - if c.discriminator_model in "random_window_discriminator": + elif c.discriminator_model in "random_window_discriminator": model = MyModel( cond_channels=c.audio["num_mels"], hop_length=c.audio["hop_length"], @@ -116,7 +116,7 @@ def setup_discriminator(c): cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], window_sizes=c.discriminator_model_params["window_sizes"], ) - if c.discriminator_model in "melgan_multiscale_discriminator": + elif c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -125,7 +125,7 @@ def setup_discriminator(c): max_channels=c.discriminator_model_params["max_channels"], downsample_factors=c.discriminator_model_params["downsample_factors"], ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + elif c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -140,7 +140,7 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + elif c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -152,6 +152,8 @@ def setup_discriminator(c): nonlinear_activation_params={"negative_slope": 0.2}, bias=True, ) - if c.discriminator_model == "univnet_discriminator": + elif c.discriminator_model == "univnet_discriminator": model = MyModel() + else: + raise NotImplementedError(f"Model {c.discriminator_model} not implemented!") return model diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index ee25559af0..292d3323bb 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -24,7 +24,7 @@ def __init__( num_res_blocks=num_res_blocks, ) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 8792950a56..6abb2dc997 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -1,5 +1,4 @@ from inspect import signature -from typing import Dict, List, Tuple 
import numpy as np import torch @@ -65,7 +64,7 @@ def inference(self, x: torch.Tensor) -> torch.Tensor: """ return self.model_g.inference(x) - def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict, optimizer_idx: int) -> tuple[dict, dict]: """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for network on the current pass. @@ -185,7 +184,7 @@ def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[ outputs = {"model_outputs": self.y_hat_g} return outputs, loss_dict - def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tuple[dict, dict]: """Logging shared by the training and evaluation. Args: @@ -205,22 +204,32 @@ def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tup return figures, audios def train_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) @@ -259,7 +268,7 @@ def on_train_step_start(self, trainer) -> None: """ self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. @@ -275,7 +284,7 @@ def get_optimizer(self) -> List: ) return [optimizer2, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -283,7 +292,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -297,7 +306,7 @@ def get_scheduler(self, optimizer) -> List: return [scheduler2, scheduler1] @staticmethod - def format_batch(batch: List) -> Dict: + def format_batch(batch: list) -> dict: """Format the batch for training. 
Args: @@ -316,12 +325,12 @@ def format_batch(batch: List) -> Dict: def get_data_loader( # pylint: disable=no-self-use, unused-argument self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, - rank: int = None, # pylint: disable=unused-argument + rank: int | None = None, # pylint: disable=unused-argument ): """Initiate and return the GAN dataloader. diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index afdd59a859..308b12ab56 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -178,6 +178,8 @@ def __init__( conv_pre_weight_norm=True, conv_post_weight_norm=True, conv_post_bias=True, + cond_in_each_up_layer=False, + pre_linear=None, ): r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) @@ -197,12 +199,17 @@ def __init__( for each consecutive upsampling layer. upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. + pre_linear (int): If not None, add nn.Linear(pre_linear, in_channels) before the convolutions. """ super().__init__() self.inference_padding = inference_padding self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_factors) + self.cond_in_each_up_layer = cond_in_each_up_layer + # initial upsampling layers + if pre_linear is not None: + self.lin_pre = nn.Linear(pre_linear, in_channels) self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 # upsampling layers @@ -236,6 +243,12 @@ def __init__( if not conv_post_weight_norm: remove_parametrizations(self.conv_post, "weight") + if self.cond_in_each_up_layer: + self.conds = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + self.conds.append(nn.Conv1d(cond_channels, ch, 1)) + def forward(self, x, g=None): """ Args: @@ -249,12 +262,19 @@ def forward(self, x, g=None): x: [B, C, T] Tensor: [B, 1, T] """ + if hasattr(self, "lin_pre"): + x = self.lin_pre(x) + x = x.permute(0, 2, 1) o = self.conv_pre(x) if hasattr(self, "cond_layer"): o = o + self.cond_layer(g) for i in range(self.num_upsamples): o = F.leaky_relu(o, LRELU_SLOPE) o = self.ups[i](o) + + if self.cond_in_each_up_layer: + o = o + self.conds[i](g) + z_sum = None for j in range(self.num_kernels): if z_sum is None: @@ -267,7 +287,7 @@ def forward(self, x, g=None): o = torch.tanh(o) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """ Args: @@ -293,9 +313,7 @@ def remove_weight_norm(self): remove_parametrizations(self.conv_pre, "weight") remove_parametrizations(self.conv_post, "weight") - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 03c971afa4..53ed700755 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -84,9 +84,7 @@ def remove_weight_norm(self): except ValueError: 
layer.remove_weight_norm() - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/multiband_melgan_generator.py b/TTS/vocoder/models/multiband_melgan_generator.py index 25d6590659..6eee712db3 100644 --- a/TTS/vocoder/models/multiband_melgan_generator.py +++ b/TTS/vocoder/models/multiband_melgan_generator.py @@ -32,7 +32,7 @@ def pqmf_analysis(self, x): def pqmf_synthesis(self, x): return self.pqmf_layer.synthesis(x) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index 211d45d91c..02ad60e0ff 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -71,7 +71,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -174,7 +174,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index 6a4d4ca6e7..71b38d4c0d 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -12,6 +12,13 @@ logger = logging.getLogger(__name__) +def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + class ParallelWaveganGenerator(torch.nn.Module): """PWGAN generator as in https://arxiv.org/pdf/1910.11480.pdf. It is similar to WaveNet with no causal convolution. @@ -101,9 +108,9 @@ def forward(self, c): # perform upsampling if c is not None and self.upsample_net is not None: c = self.upsample_net(c) - assert ( - c.shape[-1] == x.shape[-1] - ), f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + assert c.shape[-1] == x.shape[-1], ( + f" [!] Upsampling scale does not match the expected output. 
{c.shape} vs {x.shape}" + ) # encode to hidden representation x = self.first_conv(x) @@ -120,7 +127,7 @@ def forward(self, c): return x - @torch.no_grad() + @torch.inference_mode() def inference(self, c): c = c.to(self.first_conv.weight.device) c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") @@ -138,26 +145,17 @@ def _remove_weight_norm(m): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) self.apply(_apply_weight_norm) - @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - @property def receptive_field_size(self): - return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 72e57a9c39..d991941441 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -1,5 +1,4 @@ import logging -from typing import List import numpy as np import torch @@ -7,6 +6,7 @@ from torch.nn.utils import parametrize from TTS.vocoder.layers.lvc_block import LVCBlock +from TTS.vocoder.models.parallel_wavegan_generator import _get_receptive_field_size logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ def __init__( out_channels: int, hidden_channels: int, cond_channels: int, - upsample_factors: List[int], + upsample_factors: list[int], lvc_layers_each_block: int, lvc_kernel_size: int, kpnet_hidden_channels: int, @@ -127,25 +127,18 @@ def apply_weight_norm(self): """Apply weight normalization module from all of the layers.""" def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) self.apply(_apply_weight_norm) - @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - @property def receptive_field_size(self): """Return receptive field size.""" - return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """Perform inference. 
Args: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c49abd2201..5aa8ce5bb9 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -25,10 +24,10 @@ class WavegradArgs(Coqpit): use_weight_norm: bool = False y_conv_channels: int = 32 x_conv_channels: int = 768 - dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) - ublock_out_channels: List[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) - upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) - upsample_dilations: List[List[int]] = field( + dblock_out_channels: list[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: list[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: list[list[int]] = field( default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] ) @@ -123,7 +122,7 @@ def load_noise_schedule(self, path): beta = np.load(path, allow_pickle=True).item()["beta"] # pylint: disable=unexpected-keyword-arg self.compute_noise_level(beta) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, y_n=None): """ Shapes: @@ -218,9 +217,7 @@ def apply_weight_norm(self): self.out_conv = weight_norm(self.out_conv) self.y_conv = weight_norm(self.y_conv) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -242,7 +239,7 @@ def load_checkpoint( ) self.compute_noise_level(betas) - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: # format data x = batch["input"] y = batch["waveform"] @@ -258,20 +255,30 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: return {"model_output": noise_hat}, {"loss": loss} def train_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: pass - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]: return self.train_step(batch, criterion) def eval_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: pass - def test(self, assets: Dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument + def test(self, assets: dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument # setup noise schedule and inference ap = 
assets["audio_processor"] noise_schedule = self.config["test_noise_schedule"] @@ -302,13 +309,22 @@ def get_criterion(): return torch.nn.L1Loss() @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: # return a whole audio segment m, y = batch[0], batch[1] y = y.unsqueeze(1) return {"input": m, "waveform": y} - def get_data_loader(self, config: Coqpit, assets: Dict, is_eval: True, samples: List, verbose: bool, num_gpus: int): + def get_data_loader( + self, + config: Coqpit, + assets: dict, + is_eval: True, + samples: list, + verbose: bool, + num_gpus: int, + rank: int | None = None, + ): ap = assets["audio_processor"] dataset = WaveGradDataset( ap=ap, diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 723f18dde2..fb95d47589 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,7 +1,6 @@ import sys import time from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -17,6 +16,7 @@ from TTS.utils.audio.numpy_transforms import mulaw_decode from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.layers.losses import WaveRNNLoss +from TTS.vocoder.layers.upsample import Stretch2d from TTS.vocoder.models.base_vocoder import BaseVocoder from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian @@ -66,19 +66,6 @@ def forward(self, x): return x -class Stretch2d(nn.Module): - def __init__(self, x_scale, y_scale): - super().__init__() - self.x_scale = x_scale - self.y_scale = y_scale - - def forward(self, x): - b, c, h, w = x.size() - x = x.unsqueeze(-1).unsqueeze(3) - x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(b, c, h * self.y_scale, w * self.x_scale) - - class UpsampleNetwork(nn.Module): def __init__( self, @@ -183,7 +170,7 @@ class WavernnArgs(Coqpit): num_res_blocks: int = 10 use_aux_net: bool = True use_upsample_net: bool = True - upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 8, 8]) mode: str = "mold" # mold [string], gauss [string], bits [int] mulaw: bool = True # apply mulaw if mode is bits pad: int = 2 @@ -238,9 +225,9 @@ class of models has however remained an elusive problem. With a focus on text-to self.aux_dims = self.args.res_out_dims // 4 if self.args.use_upsample_net: - assert ( - np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length - ), " [!] upsample scales needs to be equal to hop_length" + assert np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length, ( + " [!] 
upsample scales needs to be equal to hop_length" + ) self.upsample = UpsampleNetwork( self.args.feat_dims, self.args.upsample_factors, @@ -319,7 +306,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) - with torch.no_grad(): + with torch.inference_mode(): if isinstance(mels, np.ndarray): mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device)) @@ -540,16 +527,14 @@ def xfade_and_unfold(y, target, overlap): return unfolded - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: self.eval() assert not self.training - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: mels = batch["input"] waveform = batch["waveform"] waveform_coarse = batch["waveform_coarse"] @@ -564,13 +549,16 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: loss_dict = criterion(y_hat, waveform_coarse) return {"model_output": y_hat}, loss_dict - def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: return self.train_step(batch, criterion) @torch.no_grad() def test( - self, assets: Dict, test_loader: "DataLoader", output: Dict # pylint: disable=unused-argument - ) -> Tuple[Dict, Dict]: + self, + assets: dict, + test_loader: "DataLoader", + output: dict, # pylint: disable=unused-argument + ) -> tuple[dict, dict]: ap = self.ap figures = {} audios = {} @@ -591,14 +579,18 @@ def test( return figures, audios def test_log( - self, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: figures, audios = outputs logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: waveform = batch[0] mels = batch[1] waveform_coarse = batch[2] @@ -607,11 +599,12 @@ def format_batch(batch: Dict) -> Dict: def get_data_loader( # pylint: disable=no-self-use self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, + rank: int | None = None, ): ap = self.ap dataset = WaveRNNDataset( diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index fe706ba9ff..bef68e5564 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -12,7 +12,7 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp(-2.0 * log_std)) return log_probs.squeeze().mean() diff --git a/TTS/vocoder/utils/generic_utils.py 
b/TTS/vocoder/utils/generic_utils.py index ac797d97f7..2823d206a0 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,5 +1,4 @@ import logging -from typing import Dict import numpy as np import torch @@ -32,7 +31,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict: +def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> dict: """Plot the predicted and the real waveform and their spectrograms. Args: diff --git a/docs/source/conf.py b/docs/source/conf.py index e7d36c1f43..e878d0e8f9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,6 +52,7 @@ "sphinx_inline_tabs", ] +suppress_warnings = ["autosectionlabel.*"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -67,6 +68,8 @@ "linkify", ] +myst_heading_anchors = 4 + # 'sphinxcontrib.katex', # 'sphinx.ext.autosectionlabel', diff --git a/docs/source/configuration.md b/docs/source/configuration.md index ada61e16db..220c96c363 100644 --- a/docs/source/configuration.md +++ b/docs/source/configuration.md @@ -1,6 +1,6 @@ # Configuration -We use 👩‍✈️[Coqpit] for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. +We use 👩‍✈️[Coqpit](https://github.com/idiap/coqui-ai-coqpit) for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. ```python from dataclasses import asdict, dataclass, field @@ -36,7 +36,7 @@ class SimpleConfig(Coqpit): check_argument("val_c", c, restricted=True) ``` -In TTS, each model must have a configuration class that exposes all the values necessary for its lifetime. +In Coqui, each model must have a configuration class that exposes all the values necessary for its lifetime. It defines model architecture, hyper-parameters, training, and inference settings. For our models, we merge all the fields in a single configuration class for ease. It may not look like a wise practice but enables easier bookkeeping and reproducible experiments. diff --git a/docs/source/formatting_your_dataset.md b/docs/source/datasets/formatting_your_dataset.md similarity index 95% rename from docs/source/formatting_your_dataset.md rename to docs/source/datasets/formatting_your_dataset.md index 23c497d0bf..e92263339e 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/datasets/formatting_your_dataset.md @@ -1,7 +1,9 @@ (formatting_your_dataset)= -# Formatting Your Dataset +# Formatting your dataset -For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription. +For training a TTS model, you need a dataset with speech recordings and +transcriptions. The speech must be divided into audio clips and each clip needs +a transcription. If you have a single audio file and you need to split it into clips, there are different open-source tools for you. We recommend Audacity. It is an open-source and free audio editing software. @@ -49,7 +51,7 @@ The format above is taken from widely-used the [LJSpeech](https://keithito.com/L Your dataset should have good coverage of the target language. 
It should cover the phonemic variety, exceptional sounds and syllables. This is extremely important for especially non-phonemic languages like English. -For more info about dataset qualities and properties check our [post](https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset). +For more info about dataset qualities and properties check [this page](what_makes_a_good_dataset.md). ## Using Your Dataset in 🐸TTS diff --git a/docs/source/datasets/index.md b/docs/source/datasets/index.md new file mode 100644 index 0000000000..6b040fc416 --- /dev/null +++ b/docs/source/datasets/index.md @@ -0,0 +1,12 @@ +# Datasets + +For training a TTS model, you need a dataset with speech recordings and +transcriptions. See the following pages for more information on: + +```{toctree} +:maxdepth: 1 + +formatting_your_dataset +what_makes_a_good_dataset +tts_datasets +``` diff --git a/docs/source/tts_datasets.md b/docs/source/datasets/tts_datasets.md similarity index 90% rename from docs/source/tts_datasets.md rename to docs/source/datasets/tts_datasets.md index 11da1b7688..df8d2f2ad9 100644 --- a/docs/source/tts_datasets.md +++ b/docs/source/datasets/tts_datasets.md @@ -1,6 +1,6 @@ -# TTS Datasets +# Public TTS datasets -Some of the known public datasets that we successfully applied 🐸TTS: +Some of the known public datasets that were successfully used for 🐸TTS: - [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/) - [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/datasets/what_makes_a_good_dataset.md similarity index 100% rename from docs/source/what_makes_a_good_dataset.md rename to docs/source/datasets/what_makes_a_good_dataset.md diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index 58d961203e..ef98fe302e 100644 --- a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -1,20 +1,20 @@ (docker_images)= -## Docker images +# Docker images We provide docker images to be able to test TTS without having to setup your own environment. -### Using premade images +## Using premade images You can use premade images built automatically from the latest TTS version. -#### CPU version +### CPU version ```bash -docker pull ghcr.io/coqui-ai/tts-cpu +docker pull ghcr.io/idiap/coqui-tts-cpu ``` -#### GPU version +### GPU version ```bash -docker pull ghcr.io/coqui-ai/tts +docker pull ghcr.io/idiap/coqui-tts ``` -### Building your own image +## Building your own image ```bash docker build -t tts . ``` @@ -25,14 +25,14 @@ You can pass any tts argument after the image name. ### CPU version ```bash -docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav +docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav ``` ### GPU version For the GPU version, you need to have the latest NVIDIA drivers installed. With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8 ```bash -docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda +docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda ``` ## Start a server @@ -41,14 +41,14 @@ Start the container and get a shell inside it. 
### CPU version ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ``` ### GPU version ```bash -docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts +docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/idiap/coqui-tts python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda ``` diff --git a/docs/source/implementing_a_new_language_frontend.md b/docs/source/extension/implementing_a_new_language_frontend.md similarity index 88% rename from docs/source/implementing_a_new_language_frontend.md rename to docs/source/extension/implementing_a_new_language_frontend.md index 2041352d64..0b3ef59be0 100644 --- a/docs/source/implementing_a_new_language_frontend.md +++ b/docs/source/extension/implementing_a_new_language_frontend.md @@ -1,6 +1,6 @@ -# Implementing a New Language Frontend +# Implementing new language front ends -- Language frontends are located under `TTS.tts.utils.text` +- Language front ends are located under `TTS.tts.utils.text` - Each special language has a separate folder. - Each folder contains all the utilities for processing the text input. - `TTS.tts.utils.text.phonemizers` contains the main phonemizer for a language. This is the class that uses the utilities diff --git a/docs/source/implementing_a_new_model.md b/docs/source/extension/implementing_a_new_model.md similarity index 96% rename from docs/source/implementing_a_new_model.md rename to docs/source/extension/implementing_a_new_model.md index 1bf7a8822e..188f466c72 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/extension/implementing_a_new_model.md @@ -1,4 +1,4 @@ -# Implementing a Model +# Implementing new models 1. Implement layers. @@ -36,7 +36,8 @@ There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you an infinite flexibility to add custom behaviours for your model and training routines. - For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. + For more details, see [BaseTTS](../main_classes/model_api.md#base-tts-model) + and [`trainer.callbacks`](https://github.com/idiap/coqui-ai-Trainer/blob/main/trainer/callbacks.py). 6. Optionally, define `MyModelArgs`. @@ -62,7 +63,7 @@ We love you more when you document your code. ❤️ -# Template 🐸TTS Model implementation +## Template 🐸TTS Model implementation You can start implementing your model by copying the following base class. diff --git a/docs/source/extension/index.md b/docs/source/extension/index.md new file mode 100644 index 0000000000..39c36b632c --- /dev/null +++ b/docs/source/extension/index.md @@ -0,0 +1,14 @@ +# Adding models or languages + +You can extend Coqui by implementing new model architectures or adding front +ends for new languages. See the pages below for more details. The [project +structure](../project_structure.md) and [contribution +guidelines](../contributing.md) may also be helpful. Please open a pull request +with your changes to share back the improvements with the community. 
+ +```{toctree} +:maxdepth: 1 + +implementing_a_new_model +implementing_a_new_language_frontend +``` diff --git a/docs/source/faq.md b/docs/source/faq.md index 1090aaa35c..4fbd149f00 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -1,28 +1,56 @@ -# Humble FAQ -We tried to collect common issues and questions we receive about 🐸TTS. It is worth checking before going deeper. +# FAQ +We tried to collect common issues and questions we receive about 🐸TTS. It is +worth checking before going deeper. -## Errors with a pre-trained model. How can I resolve this? -- Make sure you use the right commit version of 🐸TTS. Each pre-trained model has its corresponding version that needs to be used. It is defined on the model table. -- If it is still problematic, post your problem on [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) -- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. +## Using Coqui -## What are the requirements of a good 🐸TTS dataset? -* {ref}`See this page ` +### Where does Coqui store downloaded models? -## How should I choose the right model? +The path to downloaded models is printed when running `tts --list_models`. +Default locations are: + +- **Linux:** `~/.local/share/tts` +- **Mac:** `~/Library/Application Support/tts` +- **Windows:** `C:\Users\\AppData\Local\tts` + +You can change the prefix of this `tts/` folder by setting the `XDG_DATA_HOME` +or `TTS_HOME` environment variables. + +### Errors with a pre-trained model. How can I resolve this? +- Make sure you use the latest version of 🐸TTS. Each pre-trained model is only + supported from a certain minimum version. +- If it is still problematic, post your problem on + [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give + as many details as possible (error message, your TTS version, your TTS model + and config.json etc.) +- If you feel like it's a bug to be fixed, then prefer Github issues with the + same level of scrutiny. + +## Training Coqui models + +### What are the requirements of a good 🐸TTS dataset? +- [See this page](datasets/what_makes_a_good_dataset.md) + +### How should I choose the right model? - First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2. - Tacotron models produce the most natural voice if your dataset is not too noisy. - If both models do not perform well and especially the attention does not align, then try AlignTTS or GlowTTS. - If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. -## How can I train my own `tts` model? +### How can I train my own `tts` model? + +```{note} XTTS has separate fine-tuning scripts, see [here](models/xtts.md#training). +``` + 0. Check your dataset with notebooks in [dataset_analysis](https://github.com/idiap/coqui-ai-TTS/tree/main/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/idiap/coqui-ai-TTS/blob/main/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. -1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. +1. 
Write your own dataset `formatter` in `datasets/formatters.py` or [format](datasets/formatting_your_dataset) your dataset as one of the supported datasets, like LJSpeech. A `formatter` parses the metadata file and converts a list of training samples. 2. If you have a dataset with a different alphabet than English, you need to set your own character list in the ```config.json```. - - If you use phonemes for training and your language is supported [here](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. + - If you use phonemes for training and your language is supported by + [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) + or [Gruut](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. - You can use `TTS/bin/find_unique_chars.py` to get characters used in your dataset. 3. Write your own text cleaner in ```utils.text.cleaners```. It is not always necessary, except when you have a different alphabet or language-specific requirements. @@ -61,15 +89,16 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` - MultiGPU training: ```python3 -m trainer.distribute --gpus "0,1" --script TTS/bin/train_tts.py --config_path config.json``` -**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. +**Note:** You can also train your model using pure 🐍 python. Check the +[tutorial](tutorial_for_nervous_beginners.md). -## How can I train in a different language? +### How can I train in a different language? - Check steps 2, 3, 4, 5 above. -## How can I train multi-GPUs? +### How can I train multi-GPUs? - Check step 5 above. -## How can I check model performance? +### How can I check model performance? - You can inspect model training and performance using ```tensorboard```. It will show you loss, attention alignment, model output. Go with the order below to measure the model performance. 1. Check ground truth spectrograms. If they do not look as they are supposed to, then check audio processing parameters in ```config.json```. 2. Check train and eval losses and make sure that they all decrease smoothly in time. @@ -84,7 +113,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - 'bidirectional_decoder' is your ultimate savior, but it trains 2x slower and demands 1.5x more GPU memory. - You can also try the other models like AlignTTS or GlowTTS. -## How do I know when to stop training? +### How do I know when to stop training? There is no single objective metric to decide the end of a training since the voice quality is a subjective matter. In our model trainings, we follow these steps; @@ -97,17 +126,17 @@ In our model trainings, we follow these steps; Keep in mind that the approach above only validates the model robustness. It is hard to estimate the voice quality without asking the actual people. The best approach is to pick a set of promising models and run a Mean-Opinion-Score study asking actual people to score the models. -## My model does not learn. How can I debug? +### My model does not learn. How can I debug? - Go over the steps under "How can I check model performance?" -## Attention does not align. How can I make it work? +### Attention does not align. How can I make it work? - Check the 4th step under "How can I check model performance?" 
-## How can I test a trained model? -- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. +### How can I test a trained model? +- The best way is to use `tts` or `tts-server` commands. For details check [here](inference.md). - If you need to code your own ```TTS.utils.synthesizer.Synthesizer``` class. -## My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. +### My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. - In general, all of the above relates to the `stopnet`. It is the part of the model telling the `decoder` when to stop. - In general, a poor `stopnet` relates to something else that is broken in your model or dataset. Especially the attention module. - One common reason is the silent parts in the audio clips at the beginning and the ending. Check ```trim_db``` value in the config. You can find a better value for your dataset by using ```CheckSpectrogram``` notebook. If this value is too small, too much of the audio will be trimmed. If too big, then too much silence will remain. Both will curtail the `stopnet` performance. diff --git a/docs/source/index.md b/docs/source/index.md index 79993eec76..3a030b4f81 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,62 +1,63 @@ +--- +hide-toc: true +--- ```{include} ../../README.md :relative-images: +:end-before: ``` ----- - -# Documentation Content -```{eval-rst} -.. toctree:: - :maxdepth: 2 - :caption: Get started - - tutorial_for_nervous_beginners - installation - faq - contributing - -.. toctree:: - :maxdepth: 2 - :caption: Using 🐸TTS - - inference - docker_images - implementing_a_new_model - implementing_a_new_language_frontend - training_a_model - finetuning - configuration - formatting_your_dataset - what_makes_a_good_dataset - tts_datasets - marytts - -.. toctree:: - :maxdepth: 2 - :caption: Main Classes - - main_classes/trainer_api - main_classes/audio_processor - main_classes/model_api - main_classes/dataset - main_classes/gan - main_classes/speaker_manager - -.. toctree:: - :maxdepth: 2 - :caption: `tts` Models - - models/glow_tts.md - models/vits.md - models/forward_tts.md - models/tacotron1-2.md - models/overflow.md - models/tortoise.md - models/bark.md - models/xtts.md - -.. toctree:: - :maxdepth: 2 - :caption: `vocoder` Models +```{toctree} +:maxdepth: 1 +:caption: Get started +:hidden: + +tutorial_for_nervous_beginners +installation +docker_images +faq +project_structure +contributing +``` + +```{toctree} +:maxdepth: 1 +:caption: Using Coqui +:hidden: + +inference +training/index +extension/index +datasets/index +``` + + +```{toctree} +:maxdepth: 1 +:caption: Main Classes +:hidden: + +configuration +main_classes/trainer_api +main_classes/audio_processor +main_classes/model_api +main_classes/dataset +main_classes/gan +main_classes/speaker_manager +``` + + +```{toctree} +:maxdepth: 1 +:caption: TTS Models +:hidden: + +models/glow_tts.md +models/vits.md +models/forward_tts.md +models/tacotron1-2.md +models/overflow.md +models/tortoise.md +models/bark.md +models/xtts.md ``` diff --git a/docs/source/inference.md b/docs/source/inference.md index 4cb8f45a71..1bb844aee3 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -1,194 +1,22 @@ (synthesizing_speech)= -# Synthesizing Speech +# Synthesizing speech -First, you need to install TTS. We recommend using PyPi. 
You need to call the command below: +## Overview -```bash -$ pip install coqui-tts -``` - -After the installation, 2 terminal commands are available. - -1. TTS Command Line Interface (CLI). - `tts` -2. Local Demo Server. - `tts-server` -3. In 🐍Python. - `from TTS.api import TTS` - -## On the Commandline - `tts` -![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) - -After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. - -Listing released 🐸TTS models. - -```bash -tts --list_models -``` - -Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts --text "Text for TTS" \ - --model_name "tts_models///" \ - --vocoder_name "vocoder_models///" \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS model (Using Griffin-Lim Vocoder) - -```bash -tts --text "Text for TTS" \ - --model_path path/to/model.pth \ - --config_path path/to/config.json \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS and Vocoder models - -```bash -tts --text "Text for TTS" \ - --config_path path/to/config.json \ - --model_path path/to/model.pth \ - --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth \ - --vocoder_config_path path/to/vocoder_config.json -``` - -Run a multi-speaker TTS model from the released models list. - -```bash -tts --model_name "tts_models///" --list_speaker_idxs # list the possible speaker IDs. -tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "tts_models///" --speaker_idx "" -``` - -Run a released voice conversion model - -```bash -tts --model_name "voice_conversion///" - --source_wav "my/source/speaker/audio.wav" - --target_wav "my/target/speaker/audio.wav" - --out_path folder/to/save/output.wav -``` - -**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. - -## On the Demo Server - `tts-server` - - -![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) - -You can boot up a demo 🐸TTS server to run an inference with your models (make -sure to install the additional dependencies with `pip install coqui-tts[server]`). -Note that the server is not optimized for performance but gives you an easy way -to interact with the models. +Coqui TTS provides three main methods for inference: -The demo server provides pretty much the same interface as the CLI command. +1. 🐍Python API +2. TTS command line interface (CLI) +3. [Local demo server](server.md) -```bash -tts-server -h # see the help -tts-server --list_models # list the available models. +```{include} ../../README.md +:start-after: ``` -Run a TTS model, from the release models list, with its default vocoder. -If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize -speech. - -```bash -tts-server --model_name "///" -``` - -Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
- -```bash -tts-server --model_name "///" \ - --vocoder_name "///" -``` - -## Python 🐸TTS API - -You can run a multi-speaker and multi-lingual model in Python as - -```python -import torch -from TTS.api import TTS - -# Get device -device = "cuda" if torch.cuda.is_available() else "cpu" - -# List available 🐸TTS models -print(TTS().list_models()) - -# Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) - -# Run TTS -# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language -# Text to speech list of amplitude values as output -wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") -# Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -``` - -#### Here is an example for a single speaker model. - -```python -# Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False) -# Run TTS -tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) -``` - -#### Example voice cloning with YourTTS in English, French and Portuguese: - -```python -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda") -tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="output.wav") -tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav") -``` - -#### Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav` - -```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") -tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") -``` - -#### Example voice cloning by a single speaker TTS model combining with the voice conversion model. - -This way, you can clone voices by using any model in 🐸TTS. - -```python -tts = TTS("tts_models/de/thorsten/tacotron2-DDC") -tts.tts_with_vc_to_file( - "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", - file_path="ouptut.wav" -) -``` - -#### Example text to speech using **Fairseq models in ~1100 languages** 🤯. -For these models use the following name format: `tts_models//fairseq/vits`. - -You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). - -```python -from TTS.api import TTS -api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda") -api.tts_to_file("This is a test.", file_path="output.wav") -# TTS with on the fly voice conversion -api = TTS("tts_models/deu/fairseq/vits") -api.tts_with_vc_to_file( - "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", - file_path="ouptut.wav" -) +```{toctree} +:hidden: +vc +server +marytts ``` diff --git a/docs/source/installation.md b/docs/source/installation.md index 405c436643..1315395a59 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,40 +1,6 @@ # Installation -🐸TTS supports python >=3.9 <3.13.0 and was tested on Ubuntu 22.04. 
- -## Using `pip` - -`pip` is recommended if you want to use 🐸TTS only for inference. - -You can install from PyPI as follows: - -```bash -pip install coqui-tts # from PyPI -``` - -Or install from Github: - -```bash -pip install git+https://github.com/idiap/coqui-ai-TTS # from Github +```{include} ../../README.md +:start-after: +:end-before: ``` - -## Installing From Source - -This is recommended for development and more control over 🐸TTS. - -```bash -git clone https://github.com/idiap/coqui-ai-TTS -cd coqui-ai-TTS -make system-deps # only on Linux systems. - -# Install package and optional extras -make install - -# Same as above + dev dependencies and pre-commit -make install_dev -``` - -## On Windows -If you are on Windows, 👑@GuyPaddock wrote installation instructions -[here](https://stackoverflow.com/questions/66726331/) (note that these are out -of date, e.g. you need to have at least Python 3.9) diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md index 71b3d41640..bb7e9d1a1d 100644 --- a/docs/source/main_classes/model_api.md +++ b/docs/source/main_classes/model_api.md @@ -1,22 +1,22 @@ # Model API Model API provides you a set of functions that easily make your model compatible with the `Trainer`, -`Synthesizer` and `ModelZoo`. +`Synthesizer` and the Coqui Python API. -## Base TTS Model +## Base Trainer Model ```{eval-rst} .. autoclass:: TTS.model.BaseTrainerModel :members: ``` -## Base tts Model +## Base TTS Model ```{eval-rst} .. autoclass:: TTS.tts.models.base_tts.BaseTTS :members: ``` -## Base vocoder Model +## Base Vocoder Model ```{eval-rst} .. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index 335294aa4d..bdb6048e45 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,3 +1,3 @@ # Trainer API -We made the trainer a separate project on https://github.com/eginhard/coqui-trainer +We made the trainer a separate project: https://github.com/idiap/coqui-ai-Trainer diff --git a/docs/source/marytts.md b/docs/source/marytts.md index 9091ca330f..11cf4a2b9a 100644 --- a/docs/source/marytts.md +++ b/docs/source/marytts.md @@ -1,4 +1,4 @@ -# Mary-TTS API Support for Coqui-TTS +# Mary-TTS API support for Coqui TTS ## What is Mary-TTS? diff --git a/docs/source/models/bark.md b/docs/source/models/bark.md index a180afbb91..77f99c0d3a 100644 --- a/docs/source/models/bark.md +++ b/docs/source/models/bark.md @@ -37,7 +37,7 @@ from TTS.api import TTS # Load the model to GPU # Bark is really slow on CPU, so we recommend using GPU. -tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/bark").to("cuda") # Cloning a new speaker @@ -57,7 +57,7 @@ tts.tts_to_file(text="Hello, my name is Manmay , how are you?", # random speaker -tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/bark").to("cuda") tts.tts_to_file("hello world", file_path="out.wav") ``` diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index c07d879f7c..5f6c6ba44c 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -1,25 +1,25 @@ -# ⓍTTS -ⓍTTS is a super cool Text-to-Speech model that lets you clone voices in different languages by using just a quick 3-second audio clip. 
Built on the 🐢Tortoise, -ⓍTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. +# XTTS +XTTS is a super cool Text-to-Speech model that lets you clone voices in different languages by using just a quick 3-second audio clip. Built on the 🐢Tortoise, +XTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. There is no need for an excessive amount of training data that spans countless hours. -### Features +## Features - Voice cloning. - Cross-language voice cloning. - Multi-lingual speech generation. - 24khz sampling rate. -- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-inference)) +- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-manually)) - Fine-tuning support. (See [Training](#training)) -### Updates with v2 +## Updates with v2 - Improved voice cloning. - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime. - Across the board quality improvements. -### Code +## Code Current implementation only supports inference and GPT encoder training. -### Languages +## Languages XTTS-v2 supports 17 languages: - Arabic (ar) @@ -40,15 +40,15 @@ XTTS-v2 supports 17 languages: - Spanish (es) - Turkish (tr) -### License +## License This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml). -### Contact +## Contact Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Github](https://github.com/idiap/coqui-ai-TTS/discussions). -### Inference +## Inference -#### 🐸TTS Command line +### 🐸TTS Command line You can check all supported languages with the following command: @@ -64,7 +64,7 @@ You can check all Coqui available speakers with the following command: --list_speaker_idx ``` -##### Coqui speakers +#### Coqui speakers You can do inference using one of the available speakers using the following command: ```console @@ -75,10 +75,10 @@ You can do inference using one of the available speakers using the following com --use_cuda ``` -##### Clone a voice +#### Clone a voice You can clone a speaker voice using a single or multiple references: -###### Single reference +##### Single reference ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ @@ -88,7 +88,7 @@ You can clone a speaker voice using a single or multiple references: --use_cuda ``` -###### Multiple references +##### Multiple references ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ --text "Bugün okula gitmek istemiyorum." \ @@ -106,19 +106,19 @@ or for all wav files in a directory you can use: --use_cuda ``` -#### 🐸TTS API +### 🐸TTS API -##### Clone a voice +#### Clone a voice You can clone a speaker voice using a single or multiple references: -###### Single reference +##### Single reference Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio. You can optionally disable sentence splitting for better coherence but more VRAM and possibly hitting models context length limit. 
```python from TTS.api import TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -129,7 +129,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t ) ``` -###### Multiple references +##### Multiple references You can pass multiple audio files to the `speaker_wav` argument for better voice cloning. @@ -137,15 +137,15 @@ You can pass multiple audio files to the `speaker_wav` argument for better voice from TTS.api import TTS # using the default version set in 🐸TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # using a specific version # 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main # ❗some versions might be incompatible with the API -tts = TTS("xtts_v2.0.2", gpu=True) +tts = TTS("xtts_v2.0.2").to("cuda") # getting the latest XTTS_v2 -tts = TTS("xtts", gpu=True) +tts = TTS("xtts").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -154,37 +154,38 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t language="en") ``` -##### Coqui speakers +#### Coqui speakers You can do inference using one of the available speakers using the following code: ```python from TTS.api import TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings -tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - file_path="output.wav", - speaker="Ana Florence", - language="en", - split_sentences=True - ) +tts.tts_to_file( + text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + file_path="output.wav", + speaker="Ana Florence", + language="en", + split_sentences=True +) ``` -#### 🐸TTS Model API +### 🐸TTS Model API To use the model API, you need to download the model files and pass config and model file paths manually. -#### Manual Inference +### Manual Inference If you want to be able to `load_checkpoint` with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first. ```console -pip install deepspeed==0.10.3 +pip install deepspeed ``` -##### inference parameters +#### Inference parameters - `text`: The text to be synthesized. - `language`: The language of the text to be synthesized. @@ -199,7 +200,7 @@ pip install deepspeed==0.10.3 - `enable_text_splitting`: Whether to split the text into sentences and generate audio for each sentence. It allows you to have infinite input length but might loose important context between sentences. Defaults to True. 
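To see how these parameters fit together, here is a minimal sketch of a call to `model.inference()`. It assumes `model` has been loaded via `load_checkpoint()` as described above and that the conditioning latents are available (computed from your reference audio, or taken from `model.speaker_manager.speakers` as shown below); specific values such as `temperature=0.7` are illustrative placeholders, not necessarily the library defaults.

```python
import torch
import torchaudio

# Sketch only: `model`, `gpt_cond_latent` and `speaker_embedding` are assumed
# to already exist, as in the full inference example below.
out = model.inference(
    text="It took me quite a long time to develop a voice.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    temperature=0.7,             # illustrative value, tune to taste
    enable_text_splitting=True,  # split long inputs into sentences
)
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```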
-##### Inference +#### Inference ```python @@ -230,8 +231,13 @@ out = model.inference( torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) ``` +You can also use the Coqui speakers: + +```python +gpt_cond_latent, speaker_embedding = model.speaker_manager.speakers["Ana Florence"].values() +``` -##### Streaming manually +#### Streaming manually Here the goal is to stream the audio as it is being generated. This is useful for real-time applications. Streaming inference is typically slower than regular inference, but it allows to get a first chunk of audio faster. @@ -275,9 +281,9 @@ torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000) ``` -### Training +## Training -#### Easy training +### Easy training To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio demo that implements the whole fine-tuning pipeline. The gradio demo enables the user to easily do the following steps: - Preprocessing of the uploaded audio or audio files in 🐸 TTS coqui formatter @@ -286,7 +292,7 @@ To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio The user can run this gradio demo locally or remotely using a Colab Notebook. -##### Run demo on Colab +#### Run demo on Colab To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we did a Google Colab Notebook. The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). @@ -302,7 +308,7 @@ If you are not able to acess the video you need to follow the steps: 5. Soon the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference". -##### Run demo locally +#### Run demo locally To run the demo locally you need to do the following steps: 1. Install 🐸 TTS following the instructions available [here](https://coqui-tts.readthedocs.io/en/latest/installation.html). @@ -319,7 +325,7 @@ If you are not able to access the video, here is what you need to do: 4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. 5. Now you can run inference with the model by clicking on the button "Step 4 - Inference". -#### Advanced training +### Advanced training A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -393,6 +399,6 @@ torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) ## XTTS Model ```{eval-rst} -.. autoclass:: TTS.tts.models.xtts.XTTS +.. 
autoclass:: TTS.tts.models.xtts.Xtts :members: ``` diff --git a/docs/source/project_structure.md b/docs/source/project_structure.md new file mode 100644 index 0000000000..af3e472adc --- /dev/null +++ b/docs/source/project_structure.md @@ -0,0 +1,30 @@ +# Project structure + +## Directory structure + +A non-comprehensive overview of the Coqui source code: + +| Directory | Contents | +| - | - | +| **Core** | | +| **[`TTS/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS)** | Main source code | +| **[`- .models.json`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/.models.json)** | Pretrained model list | +| **[`- api.py`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/api.py)** | Python API | +| **[`- bin/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/bin)** | Executables and CLI | +| **[`- tts/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts)** | Text-to-speech models | +| **[`- configs/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/configs)** | Model configurations | +| **[`- layers/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/layers)** | Model layer definitions | +| **[`- models/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/models)** | Model definitions | +| **[`- vc/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/vc)** | Voice conversion models | +| `- (same)` | | +| **[`- vocoder/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/vocoder)** | Vocoder models | +| `- (same)` | | +| **[`- encoder/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/encoder)** | Speaker encoder models | +| `- (same)` | | +| **Recipes/notebooks** | | +| **[`notebooks/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/notebooks)** | Jupyter Notebooks for model evaluation, parameter selection and data analysis | +| **[`recipes/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes)** | Training recipes | +| **Others** | | +| **[`pyproject.toml`](https://github.com/idiap/coqui-ai-TTS/tree/dev/pyproject.toml)** | Project metadata, configuration and dependencies | +| **[`docs/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/docs)** | Documentation | +| **[`tests/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/tests)** | Unit and integration tests | diff --git a/docs/source/server.md b/docs/source/server.md new file mode 100644 index 0000000000..69bdace27b --- /dev/null +++ b/docs/source/server.md @@ -0,0 +1,30 @@ +# Demo server + +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) + +You can boot up a demo 🐸TTS server to run an inference with your models (make +sure to install the additional dependencies with `pip install coqui-tts[server]`). +Note that the server is not optimized for performance. + +The demo server provides pretty much the same interface as the CLI command. + +```bash +tts-server -h # see the help +tts-server --list_models # list the available models. +``` + +Run a TTS model, from the release models list, with its default vocoder. +If the model you choose is a multi-speaker or multilingual TTS model, you can +select different speakers and languages on the Web interface and synthesize +speech. + +```bash +tts-server --model_name "///" +``` + +Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
+ +```bash +tts-server --model_name "///" \ + --vocoder_name "///" +``` diff --git a/docs/source/finetuning.md b/docs/source/training/finetuning.md similarity index 91% rename from docs/source/finetuning.md rename to docs/source/training/finetuning.md index 548e385ec7..fa2ed34a54 100644 --- a/docs/source/finetuning.md +++ b/docs/source/training/finetuning.md @@ -1,4 +1,4 @@ -# Fine-tuning a 🐸 TTS model +# Fine-tuning a model ## Fine-tuning @@ -21,17 +21,21 @@ them and fine-tune it for your own dataset. This will help you in two main ways: Fine-tuning comes to the rescue in this case. You can take one of our pre-trained models and fine-tune it on your own speech dataset and achieve reasonable results with only a couple of hours of data. - However, note that, fine-tuning does not ensure great results. The model performance still depends on the - {ref}`dataset quality ` and the hyper-parameters you choose for fine-tuning. Therefore, + However, note that, fine-tuning does not ensure great results. The model + performance still depends on the [dataset quality](../datasets/what_makes_a_good_dataset.md) + and the hyper-parameters you choose for fine-tuning. Therefore, it still takes a bit of tinkering. ## Steps to fine-tune a 🐸 TTS model +```{note} XTTS has separate fine-tuning scripts, see [here](../models/xtts.md#training). +``` + 1. Setup your dataset. You need to format your target dataset in a certain way so that 🐸TTS data loader will be able to load it for the - training. Please see {ref}`this page ` for more information about formatting. + training. Please see [this page](../datasets/formatting_your_dataset.md) for more information about formatting. 2. Choose the model you want to fine-tune. @@ -47,7 +51,8 @@ them and fine-tune it for your own dataset. This will help you in two main ways: You should choose the model based on your requirements. Some models are fast and some are better in speech quality. One lazy way to test a model is running the model on the hardware you want to use and see how it works. For - simple testing, you can use the `tts` command on the terminal. For more info see {ref}`here `. + simple testing, you can use the `tts` command on the terminal. For more info + see [here](../inference.md). 3. Download the model. diff --git a/docs/source/training/index.md b/docs/source/training/index.md new file mode 100644 index 0000000000..b09f9cadcb --- /dev/null +++ b/docs/source/training/index.md @@ -0,0 +1,13 @@ +# Training and fine-tuning + +The following pages show you how to train and fine-tune Coqui models: + +```{toctree} +:maxdepth: 1 + +training_a_model +finetuning +``` + +Also see the [XTTS page](../models/xtts.md#training) if you want to fine-tune +that model. diff --git a/docs/source/training_a_model.md b/docs/source/training/training_a_model.md similarity index 92% rename from docs/source/training_a_model.md rename to docs/source/training/training_a_model.md index 989a57042a..22505ccb17 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training/training_a_model.md @@ -1,4 +1,4 @@ -# Training a Model +# Training a model 1. Decide the model you want to use. @@ -11,11 +11,10 @@ 3. Check the recipes. - Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point for - `Nervous Beginners`. + Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point. A recipe for `GlowTTS` using `LJSpeech` dataset looks like below. 
Let's be creative and call this `train_glowtts.py`. - ```{literalinclude} ../../recipes/ljspeech/glow_tts/train_glowtts.py + ```{literalinclude} ../../../recipes/ljspeech/glow_tts/train_glowtts.py ``` You need to change fields of the `BaseDatasetConfig` to match your dataset and then update `GlowTTSConfig` @@ -113,7 +112,7 @@ Note that different models have different metrics, visuals and outputs. - You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions + You should also check the [FAQ page](../faq.md) for common problems and solutions that occur in a training. 7. Use your best model for inference. @@ -132,7 +131,7 @@ In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models. -# Multi-speaker Training +## Multi-speaker Training Training a multi-speaker model is mostly the same as training a single-speaker model. You need to specify a couple of configuration parameters, initiate a `SpeakerManager` instance and pass it to the model. @@ -142,5 +141,5 @@ d-vectors. For using d-vectors, you first need to compute the d-vectors using th The same Glow-TTS model above can be trained on a multi-speaker VCTK dataset with the script below. -```{literalinclude} ../../recipes/vctk/glow_tts/train_glow_tts.py +```{literalinclude} ../../../recipes/vctk/glow_tts/train_glow_tts.py ``` diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index b417c4c45a..5e5eac0e0a 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -1,24 +1,40 @@ -# Tutorial For Nervous Beginners +# Tutorial for nervous beginners -## Installation +First [install](installation.md) Coqui TTS. -User friendly installation. Recommended only for synthesizing voice. +## Synthesizing Speech + +You can run `tts` and synthesize speech directly on the terminal. ```bash -$ pip install coqui-tts +$ tts -h # see the help +$ tts --list_models # list the available models. ``` -Developer friendly installation. +![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) + + +You can call `tts-server` to start a local demo server that you can open on +your favorite web browser and 🗣️ (make sure to install the additional +dependencies with `pip install coqui-tts[server]`). ```bash -$ git clone https://github.com/idiap/coqui-ai-TTS -$ cd coqui-ai-TTS -$ pip install -e . +$ tts-server -h # see the help +$ tts-server --list_models # list the available models. ``` +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) + +See [this page](inference.md) for more details on synthesizing speech with the +CLI, server or Python API. ## Training a `tts` Model -A breakdown of a simple script that trains a GlowTTS model on the LJspeech dataset. See the comments for more details. +```{note} XTTS has separate fine-tuning scripts, see [here](models/xtts.md#training). +``` + +A breakdown of a simple script that trains a GlowTTS model on the LJspeech +dataset. For a more in-depth guide to training and fine-tuning also see [this +page](training/index.md). ### Pure Python Way @@ -99,25 +115,3 @@ We still support running training from CLI like in the old days. The same traini ``` ❗️ Note that you can also use ```train_vocoder.py``` as the ```tts``` models above. - -## Synthesizing Speech - -You can run `tts` and synthesize speech directly on the terminal. 
- -```bash -$ tts -h # see the help -$ tts --list_models # list the available models. -``` - -![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) - - -You can call `tts-server` to start a local demo server that you can open on -your favorite web browser and 🗣️ (make sure to install the additional -dependencies with `pip install coqui-tts[server]`). - -```bash -$ tts-server -h # see the help -$ tts-server --list_models # list the available models. -``` -![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) diff --git a/docs/source/vc.md b/docs/source/vc.md new file mode 100644 index 0000000000..8b45d9393a --- /dev/null +++ b/docs/source/vc.md @@ -0,0 +1,84 @@ +# Voice conversion + +## Overview + +Voice conversion (VC) converts the voice in a speech signal from one speaker to +that of another speaker while preserving the linguistic content. Coqui supports +both voice conversion on its own, as well as applying it after speech synthesis +to enable multi-speaker output with single-speaker TTS models. + +### Python API + +Converting the voice in `source_wav` to the voice of `target_wav` (the latter +can also be a list of files): + +```python +from TTS.api import TTS + +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) +``` + +Voice cloning by combining TTS and VC. The FreeVC model is used for voice +conversion after synthesizing speech. + +```python + +tts = TTS("tts_models/de/thorsten/tacotron2-DDC") +tts.tts_with_vc_to_file( + "Wie sage ich auf Italienisch, dass ich dich liebe?", + speaker_wav=["target1.wav", "target2.wav"], + file_path="output.wav" +) +``` + +Some models, including [XTTS](models/xtts.md), support voice cloning directly +and a separate voice conversion step is not necessary: + +```python +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) +``` + +### CLI + +```sh +tts --out_path output/path/speech.wav \ + --model_name "//" \ + --source_wav \ + --target_wav +``` + +## Pretrained models + +Coqui includes the following pretrained voice conversion models. Training is not +supported. + +### FreeVC + +- `voice_conversion_models/multilingual/vctk/freevc24` + +Adapted from: https://github.com/OlaWod/FreeVC + +### kNN-VC + +- `voice_conversion_models/multilingual/multi-dataset/knnvc` + +At least 1-5 minutes of target speaker data are recommended. 
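As a minimal sketch, kNN-VC is used through the same Python API as FreeVC above; the audio paths are placeholders, and a list of target files is passed to give the model more target speaker audio:

```python
from TTS.api import TTS

# kNN-VC benefits from more target audio; 1-5 minutes is recommended above.
tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc").to("cuda")
tts.voice_conversion_to_file(
    source_wav="my/source.wav",
    target_wav=["target1.wav", "target2.wav", "target3.wav"],
    file_path="output.wav",
)
```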
+ +Adapted from: https://github.com/bshall/knn-vc + +### OpenVoice + +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +Adapted from: https://github.com/myshell-ai/OpenVoice diff --git a/hubconf.py b/hubconf.py index 6e10928265..b49c9d6bcc 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,14 @@ -dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"] +dependencies = [ + "torch", + "gdown", + "pysbd", + "gruut", + "anyascii", + "pypinyin", + "coqpit-config", + "mecab-python3", + "unidic-lite", +] import torch from TTS.utils.manage import ModelManager @@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us if __name__ == "__main__": - synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github") + synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github") synthesizer.tts("This is a test!") diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 4855886efd..44bf25c071 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -43,7 +43,7 @@ def process_meta_data(path): meta_data = {} # load meta data - with open(path, "r", encoding="utf-8") as f: + with open(path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") for row in data: frames = int(row[2]) @@ -58,7 +58,7 @@ def process_meta_data(path): "utt": utt, "frames": frames, "audio_len": audio_len, - "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]), + "row": f"{row[0]}|{row[1]}|{row[2]}|{row[3]}", } ) @@ -156,7 +156,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): phonemes = {} - with open(train_path, "r", encoding="utf-8") as f: + with open(train_path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") phonemes["None"] = 0 for row in data: diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/plot_embeddings_umap.ipynb similarity index 56% rename from notebooks/PlotUmapLibriTTS.ipynb rename to notebooks/plot_embeddings_umap.ipynb index 1e29790b9e..b661f85673 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/plot_embeddings_umap.ipynb @@ -4,13 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Overview\n", + "# Overview\n", "\n", "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", "\n", "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -19,63 +26,47 @@ "source": [ "import os\n", "import glob\n", + "import random\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", + "import torch\n", "import umap\n", "\n", - "from TTS.utils.audio import AudioProcessor\n", + "from TTS.bin.compute_embeddings import compute_embeddings\n", "from TTS.config import load_config\n", + "from TTS.config.shared_configs import BaseDatasetConfig\n", + "from TTS.tts.datasets import load_tts_samples\n", + "from TTS.utils.audio import AudioProcessor\n", "\n", "from bokeh.io import output_notebook, show\n", "from bokeh.plotting import figure\n", "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", "from bokeh.transform import factor_cmap\n", - "from bokeh.palettes import Category10" + "from bokeh.palettes import Category10\n", + "\n", + "output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", + "For larger sets of speakers, you can use `Category20`, but you need to change it in the `pal` variable too\n", "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", + "List of Bokeh palettes here: https://docs.bokeh.org/en/latest/docs/reference/palettes.html\n", "\n", "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", + "## Config\n", "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" + "You should adjust all the paths to point at the relevant locations for you locally." 
] }, { @@ -84,7 +75,16 @@ "metadata": {}, "outputs": [], "source": [ - "!ls -1 $MODEL_RUN_PATH" + "# Dataset\n", + "formatter_name = \"ljspeech\"\n", + "dataset_name = \"ljspeech\"\n", + "dataset_path = \"path/to/LJSpeech-1.1\"\n", + "meta_file_train = \"metadata.csv\"\n", + "\n", + "# Speaker encoder\n", + "se_model_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\"\n", + "se_config_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\"\n", + "embedding_path = \"speakers.pth\"" ] }, { @@ -93,15 +93,25 @@ "metadata": {}, "outputs": [], "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" + "dataset_config = BaseDatasetConfig()\n", + "dataset_config.formatter = formatter_name\n", + "dataset_config.dataset_name = dataset_name\n", + "dataset_config.path = dataset_path\n", + "dataset_config.meta_file_train = meta_file_train\n", + "\n", + "meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=False)\n", + "utt_to_wav = {\n", + " item[\"audio_unique_name\"]: str(Path(item[\"audio_file\"]).relative_to(dataset_path)) for item in meta_data_train\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" + "## Compute embeddings\n", + "\n", + "You can skip this if you have already computed embeddings with `TTS/bin/compute_embeddings.py`" ] }, { @@ -110,33 +120,38 @@ "metadata": {}, "outputs": [], "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" + "compute_embeddings(\n", + " model_path=se_model_path,\n", + " config_path=se_config_path,\n", + " output_path=embedding_path,\n", + " formatter_name=formatter_name,\n", + " dataset_name=dataset_name,\n", + " dataset_path=dataset_path,\n", + " meta_file_train=meta_file_train,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Check that we did indeed find an embedding" + "## Plot Umap" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "embed_files[0]" + "Bring in the embeddings created by `TTS/bin/compute_embeddings.py`" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" + "embeddings = torch.load(embedding_path, weights_only=True)" ] }, { @@ -145,15 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" + "speakers = set()\n", + "speaker_to_utter = defaultdict(list)\n", + "for idx, embedding in embeddings.items():\n", + " speaker = embedding[\"name\"]\n", + " speakers.add(speaker)\n", + " speaker_to_utter[speaker].append(idx)\n", + "print(f\"Speaker count: {len(speakers)}\")" ] }, 
{ @@ -175,35 +188,32 @@ "labels = []\n", "locations = []\n", "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", + "# single speaker\n", + "num_speakers = 1\n", + "num_utters = 1000\n", "\n", "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", + "# num_speakers = 10\n", + "# num_utters = 20\n", "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", + "speaker_idxs = random.sample(list(speakers), num_speakers)\n", "\n", "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", + " speakers_utter = speaker_to_utter[speaker_idx]\n", + " utter_idxs = random.sample(speakers_utter, num_utters)\n", " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" + " embed = np.array(embeddings[utter_idx][\"embedding\"])\n", + " embeds.append(embed)\n", + " labels.append(speaker_idx)\n", + " locations.append(utt_to_wav[utter_idx])\n", + "embeds = np.stack(embeds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Load embeddings with UMAP" + "### Load embeddings with UMAP" ] }, { @@ -222,9 +232,7 @@ "source": [ "### Interactively charting the data in Bokeh\n", "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", + "You can use the regular Bokeh [tools](https://docs.bokeh.org/en/latest/docs/user_guide/interaction/tools.html) to explore the data, with reset setting it back to normal\n", "\n", "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", "\n", @@ -238,22 +246,17 @@ "outputs": [], "source": [ "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", + " data=dict(\n", + " x=projection.T[0].tolist(),\n", + " y=projection.T[1].tolist(),\n", + " desc=locations,\n", + " label=labels,\n", " )\n", + ")\n", "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", + "hover = HoverTool(tooltips=[(\"file\", \"@desc\"), (\"speaker\", \"@label\")])\n", "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", + "### Optionally consider adding these to the tooltips if you want additional detail\n", "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", "\n", @@ -261,10 +264,13 @@ "pal_size = max(len(factors), 3)\n", "pal = Category10[pal_size]\n", "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", + "p = figure(width=600, 
height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n", + "p.scatter(\n", + " \"x\",\n", + " \"y\",\n", + " source=source_wav_stems,\n", + " color=factor_cmap(\"label\", palette=pal, factors=factors),\n", + ")\n", "\n", "url = \"http://localhost:8000/@desc\"\n", "taptool = p.select(type=TapTool)\n", @@ -292,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd $AUDIO_PATH\n", + "%cd $dataset_path\n", "%pwd\n", "!python -m http.server" ] @@ -300,7 +306,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -314,7 +320,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index d66f33d602..821ddc78d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,10 @@ build-backend = "hatchling.build" [project] name = "coqui-tts" -version = "0.24.3" +version = "0.25.3" description = "Deep learning for Text to Speech." readme = "README.md" -requires-python = ">=3.9, <3.13" +requires-python = ">=3.10, <3.13" license = {text = "MPL-2.0"} authors = [ {name = "Eren Gölge", email = "egolge@coqui.ai"} @@ -39,7 +39,6 @@ maintainers = [ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -70,30 +69,31 @@ dependencies = [ "pyyaml>=6.0", "fsspec[http]>=2023.6.0", "packaging>=23.1", + "typing_extensions>=4.10", # Inference "pysbd>=0.3.4", # Training "matplotlib>=3.7.0", # Coqui stack - "coqui-tts-trainer>=0.1.4,<0.2.0", - "coqpit>=0.0.16", + "coqui-tts-trainer>=0.2.0,<0.3.0", + "coqpit-config>=0.2.0,<0.3.0", "monotonic-alignment-search>=0.1.0", # Gruut + supported languages "gruut[de,es,fr]>=2.4.0", # Tortoise "einops>=0.6.0", - "transformers>=4.43.0,<=4.46.2", + "transformers>=4.47.0", # Bark "encodec>=0.1.1", # XTTS - "num2words>=0.5.11", - "spacy[ja]>=3,<3.8", + "num2words>=0.5.14", + "spacy[ja]>=3.2,<3.8", ] [project.optional-dependencies] # Only used in notebooks notebooks = [ - "bokeh==1.4.0", + "bokeh>=3.0.3", "pandas>=1.4,<2.0", "umap-learn>=0.5.1", ] @@ -115,7 +115,7 @@ ko = [ ] # Japanese ja = [ - "mecab-python3>=1.0.2", + "mecab-python3>=1.0.6", "unidic-lite==1.0.8", "cutlet>=0.2.0", ] @@ -135,20 +135,19 @@ all = [ [dependency-groups] dev = [ - "black==24.2.0", "coverage[toml]>=7", - "nose2>=0.15", - "pre-commit>=3", - "ruff==0.7.0", + "pre-commit>=4", + "pytest>=8", + "ruff==0.9.1", ] # Dependencies for building the documentation docs = [ - "furo>=2023.5.20", - "myst-parser==2.0.0", - "sphinx==7.2.5", + "furo>=2024.8.6", + "myst-parser==3.0.1", + "sphinx==7.4.7", "sphinx_inline_tabs>=2023.4.21", - "sphinx_copybutton>=0.1", - "linkify-it-py>=2.0.0", + "sphinx_copybutton>=0.5.2", + "linkify-it-py>=2.0.3", ] [project.urls] @@ -173,7 +172,6 @@ exclude = [ "/.readthedocs.yml", "/Makefile", "/dockerfiles", - "/run_bash_tests.sh", "/scripts", "/tests", ] @@ -192,6 +190,7 @@ lint.extend-select = [ "F704", # yield-outside-function "F706", # return-outside-function "F841", # unused-variable + "G004", # no f-string in logging "I", # import sorting "PIE790", # unnecessary-pass "PLC", @@ -201,6 +200,7 @@ lint.extend-select = [ "PLR0911", # too-many-return-statements "PLR1711", # useless-return "PLW", + "UP", # pyupgrade "W291", # 
trailing-whitespace "NPY201", # NumPy 2.0 deprecation ] @@ -231,14 +231,10 @@ max-returns = 7 "E402", # module level import not at top of file ] -[tool.black] -line-length = 120 -target-version = ['py39'] +[tool.coverage.report] +skip_covered = true +skip_empty = true [tool.coverage.run] parallel = true source = ["TTS"] - -[tool.cibuildwheel] -build = "cp*" -skip = "*-win32 *i686 *musllinux*" diff --git a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py index d31ec8f1ed..a077a18064 100644 --- a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -4,7 +4,8 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig from TTS.utils.manage import ModelManager # Logging parameters diff --git a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py index ccaa97f1e4..362f45008e 100644 --- a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py @@ -4,7 +4,8 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig from TTS.utils.manage import ModelManager # Logging parameters diff --git a/run_bash_tests.sh b/run_bash_tests.sh deleted file mode 100755 index 2f5ba88934..0000000000 --- a/run_bash_tests.sh +++ /dev/null @@ -1,7 +0,0 @@ -set -e -TF_CPP_MIN_LOG_LEVEL=3 - -# runtime bash based tests -# TODO: move these to python -./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_compute_statistics.sh diff --git a/scripts/sync_readme.py b/scripts/sync_readme.py index 584286814b..97256bca6d 100644 --- a/scripts/sync_readme.py +++ b/scripts/sync_readme.py @@ -22,8 +22,12 @@ def sync_readme(): new_content = replace_between_markers(orig_content, "tts-readme", description.strip()) if args.check: if orig_content != new_content: - print("README.md is out of sync; please edit TTS/bin/TTS_README.md and run scripts/sync_readme.py") + print( + "README.md is out of sync; please reconcile README.md and TTS/bin/synthesize.py and run scripts/sync_readme.py" + ) exit(42) + print("All good, files in sync") + exit(0) readme_path.write_text(new_content) print("Updated README.md") diff --git a/tests/__init__.py b/tests/__init__.py index f0a8b2f118..0ee20a92df 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,8 @@ import os +from collections.abc import Callable +from typing import Optional +import pytest from trainer.generic_utils import get_cuda from TTS.config import BaseDatasetConfig @@ -39,9 +42,10 @@ def get_tests_output_path(): return path -def run_cli(command): - exit_status = os.system(command) - assert exit_status == 0, f" [!] command `{command}` failed." 
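# run_main replaces the old run_cli helper: instead of shelling out, it calls a
# CLI entry point directly with an argv list and asserts on the SystemExit code
# it raises, e.g. run_main(main, ["--config_path", str(config_path)]).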
+def run_main(main_func: Callable, args: list[str] | None = None, expected_code: int = 0): + with pytest.raises(SystemExit) as exc_info: + main_func(args) + assert exc_info.value.code == expected_code def get_test_data_config(): diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 5b1fa9d38a..6caf6db30d 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -1,190 +1,194 @@ import os -import unittest -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +import pytest + +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio.processor import AudioProcessor -TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) -# pylint: disable=protected-access -class TestAudio(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf) - - def test_audio_synthesis(self): - """1. load wav - 2. set normalization parameters - 3. extract mel-spec - 4. invert to wav and save the output - """ - print(" > Sanity check for the process wav -> mel -> wav") - - def _test(max_norm, signal_norm, symmetric_norm, clip_norm): - self.ap.max_norm = max_norm - self.ap.signal_norm = signal_norm - self.ap.symmetric_norm = symmetric_norm - self.ap.clip_norm = clip_norm - wav = self.ap.load_wav(WAV_FILE) - mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_melspectrogram(mel) - file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format( - max_norm, signal_norm, symmetric_norm, clip_norm - ) - print(" | > Creating wav file at : ", file_name) - self.ap.save_wav(wav_, OUT_PATH + file_name) - - # maxnorm = 1.0 - _test(1.0, False, False, False) - _test(1.0, True, False, False) - _test(1.0, True, True, False) - _test(1.0, True, False, True) - _test(1.0, True, True, True) - # maxnorm = 4.0 - _test(4.0, False, False, False) - _test(4.0, True, False, False) - _test(4.0, True, True, False) - _test(4.0, True, False, True) - _test(4.0, True, True, True) - - def test_normalize(self): - """Check normalization and denormalization for range values and consistency""" - print(" > Testing normalization and denormalization.") - wav = self.ap.load_wav(WAV_FILE) - wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. - self.ap.signal_norm = False - x = self.ap.melspectrogram(wav) - x_old = x - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= 0 - 1, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() < 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - def test_scaler(self): - scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") - conf.stats_path = scaler_stats_path - conf.preemphasis = 0.0 - conf.do_trim_silence = True - conf.signal_norm = True - - ap = AudioProcessor(**conf) - mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) - ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - - self.ap.signal_norm = False - self.ap.preemphasis = 0.0 - - # test scaler forward and backward transforms - wav = self.ap.load_wav(WAV_FILE) - mel_reference = self.ap.melspectrogram(wav) - mel_norm = ap.melspectrogram(wav) - mel_denorm = ap.denormalize(mel_norm) - assert abs(mel_reference - mel_denorm).max() < 1e-4 - - def test_compute_f0(self): # pylint: disable=no-self-use - ap = AudioProcessor(**conf) - wav = ap.load_wav(WAV_FILE) - pitch = ap.compute_f0(wav) - mel = ap.melspectrogram(wav) - assert pitch.shape[0] == mel.shape[1] +@pytest.fixture +def ap(): + """Set up audio processor.""" + return AudioProcessor(**conf) + + +norms = [ + # maxnorm = 1.0 + (1.0, False, False, False), + (1.0, True, False, False), + (1.0, True, True, False), + (1.0, True, False, True), + (1.0, True, True, True), + # maxnorm = 4.0 + (4.0, False, False, False), + (4.0, True, False, False), + (4.0, True, True, False), + (4.0, True, False, True), + (4.0, True, True, True), +] + + +@pytest.mark.parametrize("norms", norms) +def test_audio_synthesis(tmp_path, ap, norms): + """1. load wav + 2. set normalization parameters + 3. extract mel-spec + 4. 
invert to wav and save the output + """ + print(" > Sanity check for the process wav -> mel -> wav") + max_norm, signal_norm, symmetric_norm, clip_norm = norms + ap.max_norm = max_norm + ap.signal_norm = signal_norm + ap.symmetric_norm = symmetric_norm + ap.clip_norm = clip_norm + wav = ap.load_wav(WAV_FILE) + mel = ap.melspectrogram(wav) + wav_ = ap.inv_melspectrogram(mel) + file_name = ( + f"audio_test-melspec_max_norm_{max_norm}-signal_norm_{signal_norm}-" + f"symmetric_{symmetric_norm}-clip_norm_{clip_norm}.wav" + ) + print(" | > Creating wav file at : ", file_name) + ap.save_wav(wav_, tmp_path / file_name) + + +def test_normalize(ap): + """Check normalization and denormalization for range values and consistency""" + print(" > Testing normalization and denormalization.") + wav = ap.load_wav(WAV_FILE) + wav = ap.sound_norm(wav) # normalize audio to get abetter normalization range below. + ap.signal_norm = False + x = ap.melspectrogram(wav) + x_old = x + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= 0 - 1, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= -ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. 
+ x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + ap.signal_norm = True + ap.symmetric_norm = True + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() < 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + +def test_scaler(ap): + scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") + conf.stats_path = scaler_stats_path + conf.preemphasis = 0.0 + conf.do_trim_silence = True + conf.signal_norm = True + + ap = AudioProcessor(**conf) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + ap.signal_norm = False + ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = ap.load_wav(WAV_FILE) + mel_reference = ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap.denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 + + +def test_compute_f0(ap): + wav = ap.load_wav(WAV_FILE) + pitch = ap.compute_f0(wav) + mel = ap.melspectrogram(wav) + assert pitch.shape[0] == mel.shape[1] diff --git a/tests/aux_tests/test_compute_statistics.py b/tests/aux_tests/test_compute_statistics.py new file mode 100644 index 0000000000..d6809eb480 --- /dev/null +++ b/tests/aux_tests/test_compute_statistics.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from tests import get_tests_input_path, run_main +from TTS.bin.compute_statistics import main + + +def test_compute_statistics(tmp_path): + config_path = Path(get_tests_input_path()) / "test_glow_tts_config.json" + output_path = tmp_path / "scale_stats.npy" + run_main(main, ["--config_path", str(config_path), "--out_path", str(output_path)]) diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index f2d119ac35..563c5dae02 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -1,67 +1,23 @@ -import os -import unittest +from pathlib import Path +import pytest import torch -from tests import get_tests_input_path, get_tests_output_path, run_cli +from tests import get_tests_input_path, run_main +from TTS.bin.extract_tts_spectrograms import main from TTS.config import load_config from TTS.tts.models import setup_model torch.manual_seed(1) -# pylint: disable=protected-access -class TestExtractTTSSpectrograms(unittest.TestCase): - @staticmethod - def test_GlowTTS(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth") - output_path = os.path.join(get_tests_output_path(), 
"output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') +@pytest.mark.parametrize("model", ["glow_tts", "tacotron", "tacotron2"]) +def test_extract_tts_spectrograms(tmp_path, model): + config_path = str(Path(get_tests_input_path()) / f"test_{model}_config.json") + checkpoint_path = str(tmp_path / f"{model}.pth") + output_path = str(tmp_path / "output_extract_tts_spectrograms") - @staticmethod - def test_Tacotron2(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') - - @staticmethod - def test_Tacotron(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') + config = load_config(config_path) + model = setup_model(config) + torch.save({"model": model.state_dict()}, checkpoint_path) + run_main(main, ["--config_path", config_path, "--checkpoint_path", checkpoint_path, "--output_path", output_path]) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 018679f573..53298cdebd 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -1,16 +1,12 @@ -import os -import unittest - import torch -from tests import get_tests_output_path, run_cli +from tests import run_main +from TTS.bin.find_unique_phonemes import main from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") - dataset_config_en = BaseDatasetConfig( formatter="ljspeech", meta_file_train="metadata.csv", @@ -30,52 +26,26 @@ """ -# pylint: disable=protected-access -class TestFindUniquePhonemes(unittest.TestCase): - @staticmethod - def test_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - 
phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') - - @staticmethod - def test_no_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') +def test_find_phonemes(tmp_path): + # prepare the config + config_path = str(tmp_path / "test_model_config.json") + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en], + ) + config.save_json(config_path) + + # run test + run_main(main, ["--config_path", config_path]) diff --git a/tests/tts_tests/test_helpers.py b/tests/aux_tests/test_helpers.py similarity index 76% rename from tests/tts_tests/test_helpers.py rename to tests/aux_tests/test_helpers.py index d07efa3620..6781cbc5d4 100644 --- a/tests/tts_tests/test_helpers.py +++ b/tests/aux_tests/test_helpers.py @@ -1,6 +1,14 @@ import torch as T -from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask +from TTS.tts.utils.helpers import ( + average_over_durations, + expand_encoder_outputs, + generate_attention, + generate_path, + rand_segments, + segment, + sequence_mask, +) def test_average_over_durations(): # pylint: disable=no-self-use @@ -86,3 +94,24 @@ def test_generate_path(): assert all(path[b, t, :current_idx] == 0.0) assert all(path[b, t, current_idx + durations[b, t].item() :] == 0.0) current_idx += durations[b, t].item() + + assert T.all(path == generate_attention(durations, x_mask, y_mask)) + assert T.all(path == generate_attention(durations, x_mask)) + + +def test_expand_encoder_outputs(): + inputs = T.rand(2, 5, 57) + durations = T.randint(1, 4, (2, 57)) + + x_mask = T.ones(2, 1, 57) + y_lengths = T.ones(2) * durations.sum(1).max() + + expanded, _, _ = expand_encoder_outputs(inputs, durations, x_mask, y_lengths) + + for b in range(durations.shape[0]): + index = 0 + for idx, dur in enumerate(durations[b]): + idx_expanded = expanded[b, :, index : index + dur.item()] + diff = (idx_expanded - inputs[b, :, idx].repeat(int(dur)).view(idx_expanded.shape)).sum() + assert abs(diff) < 1e-6, diff + index += dur diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 00597a0f88..129ba5d86b 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -7,18 +7,12 @@ import numpy as np from coqpit import Coqpit -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path, 
get_tests_path from TTS.utils.audio import numpy_transforms as np_transforms TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) - - -# pylint: disable=no-self-use - class TestNumpyTransforms(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/aux_tests/test_server.py b/tests/aux_tests/test_server.py new file mode 100644 index 0000000000..1b691f9596 --- /dev/null +++ b/tests/aux_tests/test_server.py @@ -0,0 +1,47 @@ +import os +import signal +import socket +import subprocess +import time +import wave + +import pytest +import requests + +PORT = 5003 + + +def wait_for_server(host, port, timeout=30): + start_time = time.time() + while time.time() - start_time < timeout: + try: + with socket.create_connection((host, port), timeout=2): + return True + except (OSError, ConnectionRefusedError): + time.sleep(1) + raise TimeoutError(f"Server at {host}:{port} did not start within {timeout} seconds.") + + +@pytest.fixture(scope="module", autouse=True) +def start_flask_server(): + server_process = subprocess.Popen( + ["python", "-m", "TTS.server.server", "--port", str(PORT)], + ) + wait_for_server("localhost", PORT) + yield + os.kill(server_process.pid, signal.SIGTERM) + server_process.wait() + + +def test_flask_server(tmp_path): + url = f"http://localhost:{PORT}/api/tts?text=synthesis%20schmynthesis" + response = requests.get(url) + assert response.status_code == 200, f"Request failed with status code {response.status_code}" + + wav_path = tmp_path / "output.wav" + with wav_path.open("wb") as f: + f.write(response.content) + + with wave.open(str(wav_path), "rb") as wav_file: + num_frames = wav_file.getnframes() + assert num_frames > 0, "WAV file contains no frames." 
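For readers who want to poke at the same endpoint outside of pytest, here is a minimal sketch of the request/response flow the new server test exercises. It assumes a demo server is already running locally (the old bash test relied on the default port 5002); the helper name fetch_tts_wav is purely illustrative and not part of the codebase.

import wave

import requests


def fetch_tts_wav(text: str, host: str = "localhost", port: int = 5002) -> bytes:
    """Request synthesized audio from the demo server and return the raw WAV bytes."""
    response = requests.get(f"http://{host}:{port}/api/tts", params={"text": text}, timeout=60)
    response.raise_for_status()
    return response.content


if __name__ == "__main__":
    audio = fetch_tts_wav("synthesis schmynthesis")
    with open("output.wav", "wb") as f:
        f.write(audio)
    # Sanity check: a valid, non-empty WAV file should contain at least one frame.
    with wave.open("output.wav", "rb") as wav_file:
        print("frames:", wav_file.getnframes())

The pytest fixture above automates the same flow, but with an explicit readiness poll instead of a fixed sleep, which is what made the old bash version flaky.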
diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py deleted file mode 100644 index 5d8626faa6..0000000000 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ /dev/null @@ -1,88 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig - - -def run_test_train(): - command = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - ) - run_cli(command) - - -config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = SpeakerEncoderConfig( - batch_size=4, - num_classes_in_batch=4, - num_utter_per_class=2, - eval_num_classes_in_batch=4, - eval_num_utter_per_class=2, - num_loader_workers=1, - epochs=1, - print_step=1, - save_step=2, - print_eval=True, - run_eval=True, - audio=BaseAudioConfig(num_mels=80), -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.loss = "ge2e" -config.save_json(config_path) - -print(config) -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test resnet speaker encoder -config.model_params["model_name"] = "resnet" -config.save_json(config_path) - -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test model with ge2e loss function -# config.loss = "ge2e" -# config.save_json(config_path) -# run_test_train() - -# test model with angleproto loss function -# config.loss = "angleproto" -# config.save_json(config_path) -# run_test_train() - -# test model with softmaxproto loss function -config.loss = "softmaxproto" -config.save_json(config_path) -run_test_train() diff --git a/tests/aux_tests/test_stft_torch.py b/tests/aux_tests/test_stft_torch.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/aux_tests/test_torch_transforms.py b/tests/aux_tests/test_torch_transforms.py new file mode 100644 index 0000000000..2da5a359c1 --- /dev/null +++ b/tests/aux_tests/test_torch_transforms.py @@ -0,0 +1,16 @@ +import numpy as np +import torch + +from TTS.utils.audio import numpy_transforms as np_transforms +from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp + + +def test_amplitude_db_conversion(): + x = torch.rand(11) + o1 = amp_to_db(x=x, spec_gain=1.0) + o2 = db_to_amp(x=o1, spec_gain=1.0) + np_o1 = np_transforms.amp_to_db(x=x, base=np.e) + np_o2 = 
np_transforms.db_to_amp(x=np_o1, base=np.e) + assert torch.allclose(x, o2) + assert torch.allclose(o1, np_o1) + assert torch.allclose(o2, np_o2) diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh deleted file mode 100755 index 721777f852..0000000000 --- a/tests/bash_tests/test_compute_statistics.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -echo "$BASEDIR" -# run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy diff --git a/tests/bash_tests/test_demo_server.sh b/tests/bash_tests/test_demo_server.sh deleted file mode 100755 index ebd0bc8b89..0000000000 --- a/tests/bash_tests/test_demo_server.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -xe - -python -m TTS.server.server & -SERVER_PID=$! - -echo 'Waiting for server...' -sleep 30 - -curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis" -python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav - -kill $SERVER_PID - -rm /tmp/audio.wav diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..975281c549 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. 
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,210 +44,210 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False -print(" > Dynamic data loader test: {}".format(DATA_EXIST)) - - -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) 
- last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. - """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. 
- self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +print(f" > Dynamic data loader test: {DATA_EXIST}") + + +def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): + # load dataset + meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) + items = meta_data_train + meta_data_eval + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=True, + return_wav=True, + tokenizer=tokenizer, + ap=ap, + samples=items, + batch_group_size=bgs, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + start_by_longest=start_by_longest, + ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=True, + num_workers=c.num_loader_workers, + ) + return dataloader, dataset + + +@pytest.mark.parametrize("dataset_config", dataset_configs) +def test_loader(dataset_config: BaseDatasetConfig): + batch_size = 1 + dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True) + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + text_input = data["token_id"] + _ = data["token_id_lengths"] + speaker_name = data["speaker_names"] + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + _ = data["stop_targets"] + _ = data["item_idxs"] + wavs = data["waveform"] + + neg_values = text_input[text_input < 0] + check_count = len(neg_values) + + # check basic conditions + assert check_count == 0 + assert linear_input.shape[0] == mel_input.shape[0] == 
batch_size
+    assert linear_input.shape[2] == ap.fft_size // 2 + 1
+    assert mel_input.shape[2] == c.audio["num_mels"]
+    assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length
+    assert isinstance(speaker_name[0], str)
+
+    # make sure that the computed mels and the waveform match and are correctly computed
+    mel_new = ap.melspectrogram(wavs[0].squeeze().numpy())
+    # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
+    mel_new = mel_new[:, : mel_lengths[0]]
+    ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
+    mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
+    assert abs(mel_diff.sum()) < 1e-5
+
+    # check normalization ranges
+    if ap.symmetric_norm:
+        assert mel_input.max() <= ap.max_norm
+        assert mel_input.min() >= -ap.max_norm
+        assert mel_input.min() < 0
+    else:
+        assert mel_input.max() <= ap.max_norm
+        assert mel_input.min() >= 0
+
+
+def test_batch_group_shuffle():
+    dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav)
+    last_length = 0
+    frames = dataset.samples
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        avg_length = mel_lengths.numpy().mean()
+    dataloader.dataset.preprocess_samples()
+    is_items_reordered = False
+    for idx, item in enumerate(dataloader.dataset.samples):
+        if item != frames[idx]:
+            is_items_reordered = True
+            break
+    assert avg_length >= last_length
+    assert is_items_reordered
+
+
+def test_start_by_longest():
+    """Test start_by_longest option.
+
+    The first item of the first batch must be longer than all the other items.
+    """
+    dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
+    dataloader.dataset.preprocess_samples()
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        if i == 0:
+            max_len = mel_lengths[0]
+        print(mel_lengths)
+        assert all(max_len >= mel_lengths)
+
+
+def test_padding_and_spectrograms(tmp_path):
+    def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
+        assert linear_input[idx, -1].sum() != 0  # check padding
+        assert linear_input[idx, -2].sum() != 0
+        assert mel_input[idx, -1].sum() != 0
+        assert mel_input[idx, -2].sum() != 0
+        assert stop_target[idx, -1] == 1
+        assert stop_target[idx, -2] == 0
+        assert stop_target[idx].sum() == 1
+        assert len(mel_lengths.shape) == 1
+        assert mel_lengths[idx] == linear_input[idx].shape[0]
+        assert mel_lengths[idx] == mel_input[idx].shape[0]
+
+    dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav)
+
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        stop_target = data["stop_targets"]
+        item_idx = data["item_idxs"]
+
+        # check mel_spec consistency
+        wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32)
+        mel = ap.melspectrogram(wav).astype("float32")
+        mel = torch.FloatTensor(mel).contiguous()
+        mel_dl = mel_input[0]
+        # NOTE: The difference below should ideally be exactly 0, but for an unknown reason
+        # there is a slight mismatch between the two matrices.
+        # TODO: Check this assert condition in more detail.
+ assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..beb7df689b 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,20 +1,17 @@ -import os +from tests import run_main +from TTS.bin.synthesize import main -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli("tts --list_models") + output_path = str(tmp_path / "output.wav") + + run_main(main, ["--list_models"]) # single speaker model - run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' - ) - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " - "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - f'--text "This is an example." 
--out_path "{output_path}"' - ) + args = ["--text", "This is an example.", "--out_path", output_path] + run_main(main, args) + + args = [*args, "--model_name", "tts_models/en/ljspeech/glow-tts"] + run_main(main, args) + run_main(main, [*args, "--vocoder_name", "vocoder_models/en/ljspeech/multiband-melgan"]) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index ce4fc751c2..21cc194131 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -23,7 +23,7 @@ def test_in_out(self): tts_root_path = get_tests_input_path() tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth") tts_config = os.path.join(tts_root_path, "dummy_model_config.json") - synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) + synthesizer = Synthesizer(tts_checkpoint=tts_checkpoint, tts_config_path=tts_config) synthesizer.tts("Better this test works!!") def test_split_into_sentences(self): diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts_config.json similarity index 100% rename from tests/inputs/test_align_tts.json rename to tests/inputs/test_align_tts_config.json diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts_config.json similarity index 100% rename from tests/inputs/test_glow_tts.json rename to tests/inputs/test_glow_tts_config.json diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech_config.json similarity index 100% rename from tests/inputs/test_speedy_speech.json rename to tests/inputs/test_speedy_speech_config.json diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad_config.json similarity index 100% rename from tests/inputs/test_vocoder_wavegrad.json rename to tests/inputs/test_vocoder_wavegrad_config.json diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000000..bd872c5b44 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1,128 @@ +import json +import shutil +from pathlib import Path +from typing import Any, TypeVar, Union + +import torch +from trainer.io import get_last_checkpoint + +from tests import run_main +from TTS.bin.synthesize import main as synthesize +from TTS.bin.train_tts import main as train_tts +from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + +TEST_TTS_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "text_cleaner": "english_cleaners", + "use_phonemes": True, + "phoneme_language": "en-us", + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "print_step": 1, + "print_eval": True, + "test_sentences": ["Be a voice, not an echo."], +} + +TEST_VC_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +Config = TypeVar("Config", BaseTTSConfig, BaseVCConfig) + + +def create_config(config_class: type[Config], **overrides: Any) -> Config: + base_config = TEST_TTS_CONFIG if issubclass(config_class, BaseTTSConfig) else TEST_VC_CONFIG + params = {**base_config, **overrides} + return config_class(**params) + + +def run_tts_train(tmp_path: Path, config: BaseTTSConfig): + config_path = tmp_path / "test_model_config.json" + 
output_path = tmp_path / "train_outputs" + + # For NeuralHMM and Overflow + parameter_path = tmp_path / "lj_parameters.pt" + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config.mel_statistics_parameter_path = parameter_path + + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + is_multi_speaker = config.use_speaker_embedding or config.use_d_vector_file + formatter = "ljspeech_test" if is_multi_speaker else "ljspeech" + command_train = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.phoneme_cache_path", + str(output_path / "phoneme_cache"), + "--coqpit.datasets.0.formatter", + formatter, + "--coqpit.datasets.0.meta_file_train", + "metadata.csv", + "--coqpit.datasets.0.meta_file_val", + "metadata.csv", + "--coqpit.datasets.0.path", + "tests/data/ljspeech", + "--coqpit.test_delay_epochs", + "0", + "--coqpit.datasets.0.meta_file_attn_mask", + "tests/data/ljspeech/metadata_attn_mask.txt", + ] + run_main(train_tts, command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + inference_command = [ + "--text", + "This is an example for the tests.", + "--config_path", + str(continue_config_path), + "--model_path", + str(continue_restore_path), + "--out_path", + str(out_wav_path), + ] + if config.use_speaker_embedding: + continue_speakers_path = continue_path / "speakers.json" + elif config.use_d_vector_file: + continue_speakers_path = config.d_vector_file + if is_multi_speaker: + inference_command.extend(["--speaker_idx", "ljspeech-1", "--speakers_file_path", str(continue_speakers_path)]) + run_main(synthesize, inference_command) + + # restore the model and continue training for one more epoch + run_main(train_tts, ["--continue_path", str(continue_path)]) + shutil.rmtree(tmp_path) diff --git a/tests/integration/test_speaker_encoder_train.py b/tests/integration/test_speaker_encoder_train.py new file mode 100644 index 0000000000..ce817680b7 --- /dev/null +++ b/tests/integration/test_speaker_encoder_train.py @@ -0,0 +1,87 @@ +import shutil + +from tests import run_main +from TTS.bin.train_encoder import main +from TTS.config.shared_configs import BaseAudioConfig +from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig + + +def test_train(tmp_path): + config_path = tmp_path / "test_speaker_encoder_config.json" + output_path = tmp_path / "train_outputs" + + def run_test_train(): + command = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.datasets.0.formatter", + "ljspeech_test", + "--coqpit.datasets.0.meta_file_train", + "metadata.csv", + "--coqpit.datasets.0.meta_file_val", + "metadata.csv", + "--coqpit.datasets.0.path", + "tests/data/ljspeech", + ] + run_main(main, command) + + config = SpeakerEncoderConfig( + batch_size=4, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + 
num_loader_workers=1, + epochs=1, + print_step=1, + save_step=2, + print_eval=True, + run_eval=True, + audio=BaseAudioConfig(num_mels=80), + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.loss = "ge2e" + config.save_json(config_path) + + print(config) + # train the model for one epoch + run_test_train() + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + run_main(main, ["--continue_path", str(continue_path)]) + shutil.rmtree(continue_path) + + # test resnet speaker encoder + config.model_params["model_name"] = "resnet" + config.save_json(config_path) + + # train the model for one epoch + run_test_train() + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + run_main(main, ["--continue_path", str(continue_path)]) + shutil.rmtree(continue_path) + + # test model with ge2e loss function + # config.loss = "ge2e" + # config.save_json(config_path) + # run_test_train() + + # test model with angleproto loss function + # config.loss = "angleproto" + # config.save_json(config_path) + # run_test_train() + + # test model with softmaxproto loss function + config.loss = "softmaxproto" + config.save_json(config_path) + run_test_train() diff --git a/tests/integration/test_train_tts.py b/tests/integration/test_train_tts.py new file mode 100644 index 0000000000..d1e35ae450 --- /dev/null +++ b/tests/integration/test_train_tts.py @@ -0,0 +1,109 @@ +import pytest + +from tests.integration import create_config, run_tts_train +from TTS.tts.configs.align_tts_config import AlignTTSConfig +from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig +from TTS.tts.configs.fast_pitch_config import FastPitchConfig +from TTS.tts.configs.fastspeech2_config import Fastspeech2Config +from TTS.tts.configs.glow_tts_config import GlowTTSConfig +from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig +from TTS.tts.configs.overflow_config import OverflowConfig +from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig +from TTS.tts.configs.tacotron2_config import Tacotron2Config +from TTS.tts.configs.tacotron_config import TacotronConfig +from TTS.tts.configs.vits_config import VitsConfig + +SPEAKER_ARGS = ( + {}, + { + "use_d_vector_file": True, + "d_vector_file": "tests/data/ljspeech/speakers.json", + "d_vector_dim": 256, + }, + { + "use_speaker_embedding": True, + "num_speakers": 4, + }, +) +SPEAKER_ARG_IDS = ["single", "dvector", "speaker_emb"] + + +def test_train_align_tts(tmp_path): + config = create_config(AlignTTSConfig, use_phonemes=False) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_delightful_tts(tmp_path, speaker_args): + config = create_config( + DelightfulTTSConfig, + batch_size=2, + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + binary_align_loss_alpha=0.0, + use_attn_priors=False, + **speaker_args, + ) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_fast_pitch(tmp_path, speaker_args): + config = create_config(FastPitchConfig, f0_cache_path="tests/data/ljspeech/f0_cache", **speaker_args) + config.audio.signal_norm = False + config.audio.mel_fmax = 8000 + config.audio.spec_gain = 1 + config.audio.log_func = 
"np.log" + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_fast_speech2(tmp_path, speaker_args): + config = create_config( + Fastspeech2Config, + f0_cache_path="tests/data/ljspeech/f0_cache", + energy_cache_path=tmp_path / "energy_cache", + **speaker_args, + ) + config.audio.signal_norm = False + config.audio.mel_fmax = 8000 + config.audio.spec_gain = 1 + config.audio.log_func = "np.log" + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_glow_tts(tmp_path, speaker_args): + config = create_config(GlowTTSConfig, batch_size=2, data_dep_init_steps=1, **speaker_args) + run_tts_train(tmp_path, config) + + +def test_train_neuralhmm(tmp_path): + config = create_config(NeuralhmmTTSConfig, batch_size=3, eval_batch_size=3, max_sampling_time=50) + run_tts_train(tmp_path, config) + + +def test_train_overflow(tmp_path): + config = create_config(OverflowConfig, batch_size=3, eval_batch_size=3, max_sampling_time=50) + run_tts_train(tmp_path, config) + + +def test_train_speedy_speech(tmp_path): + config = create_config(SpeedySpeechConfig) + run_tts_train(tmp_path, config) + + +def test_train_tacotron(tmp_path): + config = create_config(TacotronConfig, use_phonemes=False, r=5, max_decoder_steps=50) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_tacotron2(tmp_path, speaker_args): + config = create_config(Tacotron2Config, use_phonemes=False, r=5, max_decoder_steps=50, **speaker_args) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_vits(tmp_path, speaker_args): + config = create_config(VitsConfig, batch_size=2, eval_batch_size=2, **speaker_args) + run_tts_train(tmp_path, config) diff --git a/tests/integration/test_train_vocoder.py b/tests/integration/test_train_vocoder.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/integration/test_train_vocoder.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, "*/")), 
key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/integration/test_vits_multilingual_speaker_emb_train.py b/tests/integration/test_vits_multilingual_speaker_emb_train.py new file mode 100644 index 0000000000..9b095935de --- /dev/null +++ b/tests/integration/test_vits_multilingual_speaker_emb_train.py @@ -0,0 +1,130 @@ +import json +import shutil + +from trainer.io import get_last_checkpoint + +from tests import run_main +from TTS.bin.synthesize import main as synthesize +from TTS.bin.train_tts import main as train_tts +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + # active multispeaker mode + config.model_args.use_speaker_embedding = True + config.use_speaker_embedding = True + + # deactivate 
multispeaker d-vec mode + config.model_args.use_d_vector_file = False + config.use_d_vector_file = False + + # duration predictor + config.model_args.use_sdp = False + config.use_sdp = False + + # active language sampler + config.use_language_weighted_sampler = True + + config.save_json(config_path) + + # train the model for one epoch + command_train = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.test_delay_epochs", + "0", + ] + run_main(train_tts, command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + language_id = "en" + continue_speakers_path = continue_path / "speakers.json" + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = [ + "--text", + "This is an example for the tests.", + "--speaker_idx", + speaker_id, + "--language_idx", + language_id, + "--speakers_file_path", + str(continue_speakers_path), + "--language_ids_file_path", + str(continue_languages_path), + "--config_path", + str(continue_config_path), + "--model_path", + str(continue_restore_path), + "--out_path", + str(out_wav_path), + ] + run_main(synthesize, inference_command) + + # restore the model and continue training for one more epoch + run_main(train_tts, ["--continue_path", str(continue_path)]) + shutil.rmtree(tmp_path) diff --git a/tests/integration/test_vits_multilingual_train-d_vectors.py b/tests/integration/test_vits_multilingual_train-d_vectors.py new file mode 100644 index 0000000000..de0f6ed2b9 --- /dev/null +++ b/tests/integration/test_vits_multilingual_train-d_vectors.py @@ -0,0 +1,136 @@ +import json +import shutil + +from trainer.io import get_last_checkpoint + +from tests import run_main +from TTS.bin.synthesize import main as synthesize +from TTS.bin.train_tts import main as train_tts +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + 
config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + + # deactivate multispeaker mode + config.model_args.use_speaker_embedding = False + config.use_speaker_embedding = False + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + config.d_vector_dim = 256 + + # duration predictor + config.model_args.use_sdp = True + config.use_sdp = True + + # activate language and speaker samplers + config.use_language_weighted_sampler = True + config.language_weighted_sampler_alpha = 10 + config.use_speaker_weighted_sampler = True + config.speaker_weighted_sampler_alpha = 5 + + config.save_json(config_path) + + # train the model for one epoch + command_train = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.test_delay_epochs", + "0", + ] + run_main(train_tts, command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + language_id = "en" + continue_speakers_path = config.d_vector_file + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = [ + "--text", + "This is an example for the tests.", + "--speaker_idx", + speaker_id, + "--language_idx", + language_id, + "--speakers_file_path", + str(continue_speakers_path), + "--language_ids_file_path", + str(continue_languages_path), + "--config_path", + str(continue_config_path), + "--model_path", + str(continue_restore_path), + "--out_path", + str(out_wav_path), + ] + run_main(synthesize, inference_command) + + # restore the model and continue training for one more epoch + run_main(train_tts, ["--continue_path", str(continue_path)]) + shutil.rmtree(tmp_path) diff --git a/tests/integration/test_xtts_gpt_train.py b/tests/integration/test_xtts_gpt_train.py new file mode 100644 index 0000000000..4d22b8102f --- /dev/null +++ b/tests/integration/test_xtts_gpt_train.py @@ -0,0 +1,158 @@ +from pathlib import Path + +import pytest +import torch +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.layers.xtts.dvae import DiscreteVAE +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig + +config_dataset = BaseDatasetConfig( + formatter="ljspeech", + dataset_name="ljspeech", + path="tests/data/ljspeech/", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + language="en", +) + +DATASETS_CONFIG_LIST = [config_dataset] + +# Logging parameters +RUN_NAME = "GPT_XTTS_LJSpeech_FT" +PROJECT_NAME = "XTTS_trainer" +DASHBOARD_LOGGER = "tensorboard" +LOGGER_URI = None + +# XTTS transfer learning 
parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune. +TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file +XTTS_CHECKPOINT = None # model.pth file + +# Training sentence generation +SPEAKER_REFERENCE = [ + "tests/data/ljspeech/wavs/LJ001-0002.wav" +] # speaker reference to be used in training test sentences +LANGUAGE = config_dataset.language + +# Training Parameters +OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False +START_WITH_EVAL = False # if True it will start with evaluation +BATCH_SIZE = 2 # set here the batch size +GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps +# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 +# for more efficient training. You can increase/decrease BATCH_SIZE but then set +# GRAD_ACUMM_STEPS accordingly. + +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) + + +@pytest.mark.parametrize("use_perceiver", [False, True]) +def test_xtts_gpt_train(tmp_path: Path, use_perceiver: bool): + # Create DVAE checkpoint and mel_norms at test time + # DVAE parameters: for training, the DVAE is needed to extract the DVAE tokens, + # so you must provide the paths for this model + DVAE_CHECKPOINT = tmp_path / "dvae.pth" + # Mel spectrogram norms for dvae mel spectrogram extraction + MEL_NORM_FILE = tmp_path / "mel_stats.pth" + dvae = DiscreteVAE( + channels=80, + normalization=None, + positional_dims=1, + num_tokens=8192, + codebook_dim=512, + hidden_dim=512, + num_resnet_blocks=3, + kernel_size=3, + num_layers=2, + use_transposed_convs=False, + ) + torch.save(dvae.state_dict(), DVAE_CHECKPOINT) + mel_stats = torch.ones(80) + torch.save(mel_stats, MEL_NORM_FILE) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=255995, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=8194, + gpt_start_audio_token=8192, + gpt_stop_audio_token=8193, + gpt_use_perceiver_resampler=use_perceiver, + ) + + config = GPTTrainerConfig( + epochs=1, + output_path=tmp_path, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description="GPT XTTS training", + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=1000, + save_step=10000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. + optimizer="AdamW", + optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, + optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, + lr=5e-06, # learning rate + lr_scheduler="MultiStepLR", + # adjusted accordingly for the new step scheme + lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, + test_sentences=[ + { + "text": "This cake is great. 
It's so delicious and moist.", + "speaker_wav": SPEAKER_REFERENCE, + "language": LANGUAGE, + }, + ], + ) + + # init the model from config + model = GPTTrainer.init_from_config(config) + + # load training samples + train_samples, eval_samples = load_tts_samples( + DATASETS_CONFIG_LIST, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the trainer and 🚀 + trainer = Trainer( + TrainerArgs( + restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter + skip_train_epoch=False, + start_with_eval=True, + grad_accum_steps=GRAD_ACUMM_STEPS, + ), + config, + output_path=tmp_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + ) + trainer.fit() diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index f9067530e6..370a541b97 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -240,12 +240,8 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" - ) - self._EXPECTED = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" - ) + self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" + self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index 9be1f0bf41..f5d342bb00 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -24,6 +24,8 @@ def test_currency() -> None: def test_expand_numbers() -> None: assert phoneme_cleaners("-1") == "minus one" assert phoneme_cleaners("1") == "one" + assert phoneme_cleaners("1" + "0" * 35) == "one hundred decillion" + assert phoneme_cleaners("1" + "0" * 36) == "one" + " zero" * 36 def test_multilingual_phoneme_cleaners() -> None: @@ -43,11 +45,11 @@ def test_normalize_unicode() -> None: ("na\u0303", "nã"), ("o\u0302u", "ôu"), ("n\u0303", "ñ"), - ("\u4E2D\u56FD", "中国"), + ("\u4e2d\u56fd", "中国"), ("niño", "niño"), ("a\u0308", "ä"), ("\u3053\u3093\u306b\u3061\u306f", "こんにちは"), - ("\u03B1\u03B2", "αβ"), + ("\u03b1\u03b2", "αβ"), ] for arg, expect in test_cases: assert normalize_unicode(arg) == expect diff --git a/tests/tts_tests2/test_delightful_tts_layers.py b/tests/tts_tests/test_delightful_tts_layers.py similarity index 100% rename from tests/tts_tests2/test_delightful_tts_layers.py rename to tests/tts_tests/test_delightful_tts_layers.py diff --git a/tests/tts_tests2/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py similarity index 100% rename from tests/tts_tests2/test_feed_forward_layers.py rename to tests/tts_tests/test_feed_forward_layers.py diff --git a/tests/tts_tests2/test_forward_tts.py b/tests/tts_tests/test_forward_tts.py similarity index 87% rename from tests/tts_tests2/test_forward_tts.py rename to tests/tts_tests/test_forward_tts.py index cec0f211c8..13a2c270af 100644 --- a/tests/tts_tests2/test_forward_tts.py 
+++ b/tests/tts_tests/test_forward_tts.py @@ -6,29 +6,7 @@ # pylint: disable=unused-variable -def expand_encoder_outputs_test(): - model = ForwardTTS(ForwardTTSArgs(num_chars=10)) - - inputs = T.rand(2, 5, 57) - durations = T.randint(1, 4, (2, 57)) - - x_mask = T.ones(2, 1, 57) - y_mask = T.ones(2, 1, durations.sum(1).max()) - - expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask) - - for b in range(durations.shape[0]): - index = 0 - for idx, dur in enumerate(durations[b]): - diff = ( - expanded[b, :, index : index + dur.item()] - - inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape) - ).sum() - assert abs(diff) < 1e-6, diff - index += dur - - -def model_input_output_test(): +def test_model_input_output(): """Assert the output shapes of the model in different modes""" # VANILLA MODEL diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py similarity index 95% rename from tests/tts_tests2/test_glow_tts.py rename to tests/tts_tests/test_glow_tts.py index 3c7ac51556..c92063576f 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -42,8 +42,8 @@ def _create_inputs(batch_size=8): def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -107,7 +107,7 @@ def _test_forward(self, batch_size): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -134,7 +134,7 @@ def _test_forward_with_d_vector(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -160,7 +160,7 @@ def _test_forward_with_speaker_id(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -241,10 +241,10 @@ def _test_inference_with_MAS(self, batch_size): # inference encoder and decoder with MAS y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) y2 = model.decoder_inference(mel_spec, mel_lengths) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape + assert y2["model_outputs"].shape == 
y["model_outputs"].shape, ( + "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) ) def test_inference_with_MAS(self): @@ -261,7 +261,7 @@ def test_train_step(self): # reference model to compare model weights model_ref = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) count = 0 diff --git a/tests/tts_tests/test_losses.py b/tests/tts_tests/test_losses.py index 794478dca3..2290e9a6cc 100644 --- a/tests/tts_tests/test_losses.py +++ b/tests/tts_tests/test_losses.py @@ -21,7 +21,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -29,14 +29,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -52,7 +52,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -60,14 +60,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class MSELossMaskedTests(unittest.TestCase): @@ -85,7 +85,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert 
output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -93,14 +93,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -116,7 +116,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -124,14 +124,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class SSIMLossTests(unittest.TestCase): @@ -153,7 +153,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.ones(4) * 58).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() >= 1.0, "0 vs {}".format(output.item()) + assert output.item() >= 1.0, f"0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -168,7 +168,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -184,7 +184,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 57, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -192,14 +192,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 
100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 57, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class BCELossTest(unittest.TestCase): diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py deleted file mode 100644 index 25d9aa8148..0000000000 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import json -import os -import shutil - -import torch -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") - -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) - -config = NeuralhmmTTSConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - - -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - - -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: 
- config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py deleted file mode 100644 index 86fa60af72..0000000000 --- a/tests/tts_tests/test_overflow_train.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import json -import os -import shutil - -import torch -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.overflow_config import OverflowConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") - -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) - -config = OverflowConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - - -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - - -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", 
encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py deleted file mode 100644 index 530781ef88..0000000000 --- a/tests/tts_tests/test_speedy_speech_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig - -config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = SpeedySpeechConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py deleted file mode 100644 index 99ba4349c4..0000000000 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ /dev/null @@ -1,79 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=False, - use_d_vector_file=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - max_decoder_steps=50, -) - -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 72b6bcd46b..72069bf943 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -72,8 +72,8 @@ def test_train_step(self): # pylint: disable=no-self-use for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -131,8 +131,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -198,8 +198,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -254,8 +254,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -321,8 +321,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -384,7 +384,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py deleted file mode 100644 index 5f1bc3fd50..0000000000 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ /dev/null @@ -1,77 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=True, - num_speakers=4, - max_decoder_steps=50, -) - -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py deleted file mode 100644 index 40107070e1..0000000000 --- a/tests/tts_tests/test_tacotron2_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 43e72417c2..9521cfea26 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -67,8 +67,8 @@ def test_in_out(): output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None) assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) + assert output.shape[1] == 80, f"size not {output.shape[1]}" + assert output.shape[2] == 2, f"size not {output.shape[2]}" assert stop_tokens.shape[0] == 4 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 7ec3f0df1b..5f9af86e7e 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -51,7 +51,7 @@ def test_train_step(): criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -71,8 +71,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -105,7 +105,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -127,8 +127,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 @@ -165,7 +165,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -186,8 +186,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -217,7 +217,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -238,8 +238,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -288,7 +288,7 @@ def test_train_step(): criterion = model.get_criterion() optimizer = model.get_optimizer() model.train() - print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron with Capacitron VAE model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -305,8 +305,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -341,7 +341,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -366,7 +366,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py deleted file mode 100644 index f7751931ae..0000000000 --- a/tests/tts_tests/test_tacotron_train.py +++ /dev/null @@ -1,64 +0,0 @@ -import glob -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron_config import TacotronConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = TacotronConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - r=5, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 17992773ad..790439ecb2 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -13,14 +13,10 @@ Vits, VitsArgs, VitsAudioConfig, - amp_to_db, - db_to_amp, load_audio, - spec_to_mel, - wav_to_mel, - wav_to_spec, ) from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp, spec_to_mel, wav_to_mel, wav_to_spec LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") @@ -377,8 +373,8 @@ def _check_parameter_changes(model, model_ref): name = item1[0] param = item1[1] param_ref = item2[1] - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - name, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count = count + 1 diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py deleted file mode 100644 index 741bda91e9..0000000000 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ /dev/null @@ -1,61 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py deleted file mode 100644 index 71597ef32f..0000000000 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ /dev/null @@ -1,110 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - 
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True -# active multispeaker mode -config.model_args.use_speaker_embedding = True -config.use_speaker_embedding = True - -# deactivate multispeaker d-vec mode -config.model_args.use_d_vector_file = False -config.use_d_vector_file = False - -# duration predictor -config.model_args.use_sdp = False -config.use_sdp = False - -# active language sampler -config.use_language_weighted_sampler = True - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -languae_id = "en" -continue_speakers_path = os.path.join(continue_path, "speakers.json") -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py deleted file mode 100644 index fd58db534a..0000000000 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ /dev/null @@ -1,117 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="multilingual_cleaners", - use_phonemes=False, - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True - -# deactivate multispeaker mode -config.model_args.use_speaker_embedding = False -config.use_speaker_embedding = False - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 -config.d_vector_dim = 256 - -# duration predictor -config.model_args.use_sdp = True -config.use_sdp = True - -# activate language and speaker samplers -config.use_language_weighted_sampler = True -config.language_weighted_sampler_alpha = 10 -config.use_speaker_weighted_sampler = True -config.speaker_weighted_sampler_alpha = 5 - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = 
os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -languae_id = "en" -continue_speakers_path = config.d_vector_file -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py deleted file mode 100644 index b7fe197cfe..0000000000 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ /dev/null @@ -1,83 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-1"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" 
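# [editor's note] The deleted VITS / GlowTTS / FastPitch recipe tests above and below all
# repeat the same smoke-test shape: save a small config, train one epoch through the CLI,
# pick the newest run directory, synthesize once with the `tts` CLI, then resume training
# and clean up. A minimal, hypothetical pytest-style sketch of that shared recipe follows;
# the helper names (`train_one_epoch`, `build_tts_command`) are illustrative assumptions
# and not part of this diff.
import glob
import os

from tests import get_device_id, run_cli


def train_one_epoch(config_path, output_path):
    """Train via the CLI for one epoch and return the newest run directory."""
    run_cli(
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py "
        f"--config_path {config_path} --coqpit.output_path {output_path} "
        "--coqpit.test_delay_epochs 0"
    )
    # The trainer writes one timestamped folder per run; the latest one holds the checkpoint.
    return max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)


def build_tts_command(config_path, model_path, out_path, speaker_idx=None, speakers_file=None):
    """Assemble the `tts` CLI call these recipes use (single- or multi-speaker variant)."""
    cmd = (
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' "
        f"--config_path {config_path} --model_path {model_path} --out_path {out_path}"
    )
    if speaker_idx is not None:
        cmd += f" --speaker_idx {speaker_idx} --speakers_file_path {speakers_file}"
    return cmd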
-continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py deleted file mode 100644 index ea5dc02405..0000000000 --- a/tests/tts_tests/test_vits_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/__init__.py b/tests/tts_tests2/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py deleted file mode 100644 index 9b0b730df4..0000000000 --- a/tests/tts_tests2/test_align_tts_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.align_tts_config import AlignTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = AlignTTSConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) - -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py deleted file mode 100644 index 8fc4ea7e9b..0000000000 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ /dev/null @@ -1,100 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs( - use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 -) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], - output_path=output_path, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - speaker_embedding_channels=256, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = False -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not 
None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py deleted file mode 100644 index 6fb70c5f61..0000000000 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ /dev/null @@ -1,94 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs(use_speaker_embedding=False) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech"], - ], - output_path=output_path, - num_speakers=4, - use_speaker_embedding=True, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.dataset_name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py deleted file mode 100644 index a917d77657..0000000000 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ /dev/null @@ -1,97 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs() - -vocoder_config = VocoderConfig() - - -config = DelightfulTTSConfig( - audio=audio_config, - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - run_eval=True, - test_delay_epochs=-1, - binary_align_loss_alpha=0.0, - epochs=1, - print_step=1, - use_attn_priors=False, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], - use_speaker_embedding=False, -) -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs -1" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == -1 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py deleted file mode 100644 index 7f79bfcab2..0000000000 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fast_pitch_config import FastPitchConfig - -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert 
config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py deleted file mode 100644 index a525715b53..0000000000 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ /dev/null @@ -1,91 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fast_pitch_config import FastPitchConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py deleted file mode 100644 index 35bda597d5..0000000000 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ /dev/null @@ -1,95 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fastspeech2_config import Fastspeech2Config - -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts 
--text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py deleted file mode 100644 index dd4b07d240..0000000000 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ /dev/null @@ -1,94 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fastspeech2_config import Fastspeech2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py deleted file mode 100644 index f1cfd4368f..0000000000 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ /dev/null @@ -1,79 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.glow_tts_config import GlowTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py deleted file mode 100644 index b1eb6237a4..0000000000 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ /dev/null @@ -1,76 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.glow_tts_config import GlowTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=True, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py deleted file mode 100644 index 0a8e226b65..0000000000 --- a/tests/tts_tests2/test_glow_tts_train.py +++ /dev/null @@ -1,73 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.glow_tts_config import GlowTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index c90551b494..784e32a68d 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -22,31 +22,19 @@ class TestFreeVC(unittest.TestCase): def _create_inputs(self, config, batch_size=2): - input_dummy = torch.rand(batch_size, 30 * config.audio["hop_length"]).to(device) - input_lengths = torch.randint(100, 30 * config.audio["hop_length"], (batch_size,)).long().to(device) - input_lengths[-1] = 30 * config.audio["hop_length"] spec = torch.rand(batch_size, 30, config.audio["filter_length"] // 2 + 1).to(device) mel = torch.rand(batch_size, 30, config.audio["n_mel_channels"]).to(device) spec_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) spec_lengths[-1] = spec.size(2) waveform = torch.rand(batch_size, spec.size(2) * config.audio["hop_length"]).to(device) - return input_dummy, input_lengths, mel, spec, spec_lengths, waveform + return mel, spec, spec_lengths, waveform @staticmethod def _create_inputs_inference(): - source_wav = torch.rand(16000) + source_wav = torch.rand(15999) target_wav = torch.rand(16000) return source_wav, target_wav - @staticmethod - def _check_parameter_changes(model, model_ref): - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 - def test_methods(self): config = FreeVCConfig() model = FreeVC(config).to(device) @@ -67,9 +55,9 @@ def _test_forward(self, batch_size): config = FreeVCConfig() model = FreeVC(config).to(device) model.train() - print(" > Num parameters for FreeVC model:%s" % (count_parameters(model))) + print(f" > Num parameters for FreeVC model:{count_parameters(model)}") - _, _, mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) + mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) wavlm_vec = model.extract_wavlm_features(waveform) wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) @@ -86,15 +74,15 @@ def _test_inference(self, batch_size): model = FreeVC(config).to(device) model.eval() - _, _, mel, _, _, waveform = self._create_inputs(config, batch_size) + mel, _, _, waveform = self._create_inputs(config, batch_size) wavlm_vec = model.extract_wavlm_features(waveform) wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) output_wav = model.inference(wavlm_vec, None, mel, wavlm_vec_lengths) - assert ( - output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1] - ), f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + assert output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1], ( + f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + ) def test_inference(self): self._test_inference(1) @@ -107,9 +95,9 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0] - ), f"{output_wav.shape} != 
{source_wav.shape}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + ) def test_train_step(self): ... diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py new file mode 100644 index 0000000000..703873ea47 --- /dev/null +++ b/tests/vc_tests/test_openvoice.py @@ -0,0 +1,41 @@ +import os +import unittest + +import torch + +from tests import get_tests_input_path +from TTS.vc.models.openvoice import OpenVoice, OpenVoiceConfig + +torch.manual_seed(1) +use_cuda = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +c = OpenVoiceConfig() + +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + + +class TestOpenVoice(unittest.TestCase): + @staticmethod + def _create_inputs_inference(): + source_wav = torch.rand(16100) + target_wav = torch.rand(16000) + return source_wav, target_wav + + def test_load_audio(self): + config = OpenVoiceConfig() + model = OpenVoice(config).to(device) + wav = model.load_audio(WAV_FILE) + wav2 = model.load_audio(wav) + assert all(torch.isclose(wav, wav2)) + + def test_voice_conversion(self): + config = OpenVoiceConfig() + model = OpenVoice(config).to(device) + model.eval() + + source_wav, target_wav = self._create_inputs_inference() + output_wav = model.voice_conversion(source_wav, target_wav) + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}" + ) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 9d4e193382..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index c506fb48dc..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, 
get_tests_output_path, run_cli -from TTS.vocoder.configs import HifiganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 6ef9cd495b..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 8002760706..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - 
print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index a126befe2e..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import 
torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def 
wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 43b5f08042..d1d3610b70 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -1,5 +1,3 @@ -import unittest - import numpy as np import torch from torch import optim @@ -10,50 +8,43 @@ # pylint: disable=unused-variable torch.manual_seed(1) -use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class WavegradTrainTest(unittest.TestCase): - def test_train_step(self): # pylint: disable=no-self-use - """Test if all layers are updated in a basic training cycle""" - input_dummy = torch.rand(8, 1, 20 * 300).to(device) - mel_spec = torch.rand(8, 80, 20).to(device) - - criterion = torch.nn.L1Loss().to(device) - args = WavegradArgs( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ) - config = WavegradConfig(model_params=args) - model = Wavegrad(config) - - model_ref = Wavegrad(config) - model.train() - model.to(device) - betas = np.linspace(1e-6, 1e-2, 1000) - model.compute_noise_level(betas) - model_ref.load_state_dict(model.state_dict()) - 
model_ref.to(device) - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) - for i in range(5): - y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) - optimizer.zero_grad() - loss = criterion(y_hat, input_dummy) - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 +def test_train_step(): + """Test if all layers are updated in a basic training cycle""" + torch.set_grad_enabled(True) + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + args = WavegradArgs( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) + + model_ref = Wavegrad(config) + model.train() + model.to(device) + betas = np.linspace(1e-6, 1e-2, 1000) + model.compute_noise_level(betas) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + optimizer = optim.Adam(model.parameters(), lr=0.001) + for _ in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any(), f"param {i} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 9b10759505..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import os -import shutil -import unittest - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig - - -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 337e24259f..0000000000 --- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,45 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git 
a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py deleted file mode 100644 index b8b9a4e388..0000000000 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ /dev/null @@ -1,163 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" -PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -# Set here the path that the checkpoints will be saved. Default: ./run/training/ -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -MEL_NORM_FILE = os.path.join( - OUT_PATH, "mel_stats.pth" -) # Mel spectrogram norms, required for dvae mel spectrogram extraction -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. -TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. 
- - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, -) -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description=""" - GPT XTTS training - """, - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py deleted file mode 100644 index 6663433c12..0000000000 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ /dev/null @@ -1,163 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" 
-PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -# Mel spectrogram norms, required for dvae mel spectrogram extraction -MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth") -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. -TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, - gpt_use_masking_gt_prompt_approach=True, - gpt_use_perceiver_resampler=True, -) - -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) - -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description="GPT XTTS training", - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. 
- optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/zoo_tests/test_big_models.py b/tests/zoo_tests/test_big_models.py new file mode 100644 index 0000000000..8a9780b4f0 --- /dev/null +++ b/tests/zoo_tests/test_big_models.py @@ -0,0 +1,193 @@ +"""These tests should be run locally because the models are too big for CI.""" + +import os + +import pytest +import torch + +from tests import get_tests_data_path, run_main +from TTS.bin.synthesize import main +from TTS.utils.manage import ModelManager + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + os.environ["COQUI_TOS_AGREED"] = "1" + + +@pytest.fixture +def manager(): + """Set up model manager.""" + return ModelManager(progress_bar=False) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts(tmp_path): + """XTTS is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v1.1", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") + speaker_wav.append(speaker_wav_2) + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v1.1") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_v2(tmp_path): + """XTTS is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v2", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_v2_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v2") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + normal_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=1.5, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + fast_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=0.66, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + slow_len = sum([len(chunk) for chunk in wav_chunks]) + + assert slow_len > normal_len + assert normal_len > fast_len + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_tortoise(tmp_path): + args = [ + "--model_name", + "tts_models/en/multi-dataset/tortoise-v2", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_bark(tmp_path): + """Bark is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/bark", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index b944423988..9f02672ef1 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3` -import glob import os import shutil -import torch -from trainer.io import get_user_data_dir +import pytest -from tests import get_tests_data_path, get_tests_output_path, run_cli +from tests import get_tests_data_path, run_main +from TTS.api import TTS +from TTS.bin.synthesize import main from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.manage import ModelManager @@ -19,252 +19,81 @@ ] -def run_models(offset=0, step=1): - """Check if all the models are downloadable and tts models run correctly.""" - print(" > Run synthesizer with all the models.") - output_path = os.path.join(get_tests_output_path(), "output.wav") - manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False) - model_names = [name for name in manager.list_models() if name not in MODELS_WITH_SEP_TESTS] - print("Model names:", model_names) - for model_name in model_names[offset::step]: - print(f"\n > Run - {model_name}") - model_path, _, _ = manager.download_model(model_name) - if "tts_models" in model_name: - local_download_dir = os.path.dirname(model_path) - # download and run the model - speaker_files = glob.glob(local_download_dir + "/speaker*") - language_files = glob.glob(local_download_dir + "/language*") - language_id = "" - if len(speaker_files) > 0: - # multi-speaker model - if "speaker_ids" in speaker_files[0]: - speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) - elif "speakers" in speaker_files[0]: - speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) - - # multi-lingual model - Assuming multi-lingual models are also multi-speaker - if len(language_files) > 0 and "language_ids" in language_files[0]: - language_manager = LanguageManager(language_ids_file_path=language_files[0]) - language_id = language_manager.language_names[0] - - speaker_id = list(speaker_manager.name_to_id.keys())[0] - run_cli( - f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --no-progress_bar' - ) - else: - # single-speaker model - run_cli( - f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) - # remove downloaded models - shutil.rmtree(local_download_dir) - shutil.rmtree(get_user_data_dir("tts")) - elif "voice_conversion_models" in model_name: - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - run_cli( - f"tts --model_name {model_name} " - f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar' - ) - else: - # only download the model - manager.download_model(model_name) - print(f" | > OK: {model_name}") - - -def test_xtts(): - """XTTS is too big to run on github actions. 
We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) - else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) - - -def test_xtts_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts - - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - speaker_wav.append(speaker_wav_2) - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) - - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - - -def test_xtts_v2(): - """XTTS is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) +@pytest.fixture(autouse=True) +def run_around_tests(tmp_path): + """Download models to a temp folder and delete it afterwards.""" + os.environ["TTS_HOME"] = str(tmp_path) + yield + shutil.rmtree(tmp_path) + + +@pytest.fixture +def manager(tmp_path): + """Set up model manager.""" + return ModelManager(output_prefix=tmp_path, progress_bar=False) + + +# To split tests into different CI jobs +num_partitions = int(os.getenv("NUM_PARTITIONS", "1")) +partition = int(os.getenv("TEST_PARTITION", "0")) +model_names = [name for name in TTS.list_models() if name not in MODELS_WITH_SEP_TESTS] +model_names.extend(["tts_models/deu/fairseq/vits", "tts_models/sqi/fairseq/vits"]) +model_names = [name for i, name in enumerate(model_names) if i % num_partitions == partition] + + +@pytest.mark.parametrize("model_name", model_names) +def test_models(tmp_path, model_name, manager): + print(f"\n > Run - {model_name}") + output_path = str(tmp_path / "output.wav") + model_path, _, _ = manager.download_model(model_name) + args = ["--model_name", model_name, "--out_path", output_path, "--no-progress_bar"] + if "tts_models" in model_name: + local_download_dir = model_path.parent + # download and run the model + speaker_files = list(local_download_dir.glob("speaker*")) + language_files = list(local_download_dir.glob("language*")) + speaker_arg = [] + language_arg = [] + if len(speaker_files) > 0: + # multi-speaker model + if "speaker_ids" in speaker_files[0].stem: + speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) + elif "speakers" in speaker_files[0].stem: + speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) + speakers = list(speaker_manager.name_to_id.keys()) + if len(speakers) > 1: + speaker_arg = ["--speaker_idx", speakers[0]] + if len(language_files) > 0 and "language_ids" in language_files[0].stem: + # multi-lingual model + language_manager = LanguageManager(language_ids_file_path=language_files[0]) + languages = language_manager.language_names + if len(languages) > 1: + language_arg = ["--language_idx", languages[0]] + run_main(main, [*args, "--text", "This is an example.", *speaker_arg, *language_arg]) + elif "voice_conversion_models" in model_name: + speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") + reference_wav1 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0028.wav") + reference_wav2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") + run_main(main, [*args, "--source_wav", speaker_wav, "--target_wav", reference_wav1, reference_wav2]) else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) - - -def test_xtts_v2_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts + # only download the model + manager.download_model(model_name) + print(f" | > OK: {model_name}") - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) - - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - normal_len = sum([len(chunk) for chunk in wav_chuncks]) - - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=1.5, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - fast_len = sum([len(chunk) for chunk in wav_chuncks]) - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=0.66, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - slow_len = sum([len(chunk) for chunk in wav_chuncks]) - - assert slow_len > normal_len - assert normal_len > fast_len - - -def test_tortoise(): - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) - - -def test_bark(): - """Bark is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar' - ) - - -def test_voice_conversion(): +def test_voice_conversion(tmp_path): print(" > Run voice conversion inference using YourTTS model.") - model_name = "tts_models/multilingual/multi-dataset/your_tts" - language_id = "en" - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli( - f"tts --model_name {model_name}" - f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar" - ) - - -""" -These are used to split tests into different actions on Github. -""" - - -def test_models_offset_0_step_3(): - run_models(offset=0, step=3) - - -def test_models_offset_1_step_3(): - run_models(offset=1, step=3) - - -def test_models_offset_2_step_3(): - run_models(offset=2, step=3) + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/your_tts", + "--out_path", + str(tmp_path / "output.wav"), + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + "--reference_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav"), + "--language_idx", + "en", + "--no-progress_bar", + ] + run_main(main, args)