diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8d639d5dee..7905add3f7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,7 +37,6 @@ jobs: sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -68,7 +67,7 @@ jobs: fail-fast: false matrix: python-version: ["3.9", "3.12"] - subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts"] steps: - uses: actions/checkout@v4 - name: Setup uv @@ -76,13 +75,12 @@ jobs: - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} - name: Install Espeak - if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts"]'), matrix.subset) run: | sudo apt-get update sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -107,9 +105,50 @@ jobs: name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* if-no-files-found: ignore + zoo: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + partition: ["0", "1", "2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Zoo tests + run: uv run --extra server --extra languages make test_zoo + env: + NUM_PARTITIONS: 3 + TEST_PARTITION: ${{ matrix.partition }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: coverage-data-zoo-${{ matrix.partition }} + path: .coverage.* + if-no-files-found: ignore coverage: if: always() - needs: [unit, integration] + needs: [unit, integration, zoo] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d4a8cf0090..2b3a973763 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,30 +11,25 @@ You can contribute not only with code but with bug reports, comments, questions, If you like to contribute code, squash a bug but if you don't know where to start, here are some pointers. -- [Development Road Map](https://github.com/coqui-ai/TTS/issues/378) - - You can pick something out of our road map. We keep the progess of the project in this simple issue thread. It has new model proposals or developmental updates etc. 
- - [Github Issues Tracker](https://github.com/idiap/coqui-ai-TTS/issues) This is a place to find feature requests, bugs. - Issues with the ```good first issue``` tag are good place for beginners to take on. - -- ✨**PR**✨ [pages](https://github.com/idiap/coqui-ai-TTS/pulls) with the ```🚀new version``` tag. - - We list all the target improvements for the next version. You can pick one of them and start contributing. + Issues with the ```good first issue``` tag are good place for beginners to + take on. Issues tagged with `help wanted` are suited for more experienced + outside contributors. - Also feel free to suggest new features, ideas and models. We're always open for new things. -## Call for sharing language models +## Call for sharing pretrained models If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified. This model can be shared in two ways: 1. Share the model files with us and we serve them with the next 🐸 TTS release. 2. Upload your models on GDrive and share the link. -Models are served under `.models.json` file and any model is available under TTS CLI or Server end points. +Models are served under `.models.json` file and any model is available under TTS +CLI and Python API end points. Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930). @@ -135,7 +130,8 @@ curl -LsSf https://astral.sh/uv/install.sh | sh 13. Let's discuss until it is perfect. 💪 - We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/idiap/coqui-ai-TTS/pulls]. + We might ask you for certain changes that would appear in the + [Github ✨**PR**✨'s page](https://github.com/idiap/coqui-ai-TTS/pulls). 14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. @@ -143,9 +139,9 @@ curl -LsSf https://astral.sh/uv/install.sh | sh If you prefer working within a Docker container as your development environment, you can do the following: -1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page. +1. Fork the 🐸TTS [Github repository](https://github.com/idiap/coqui-ai-TTS) by clicking the fork button at the top right corner of the page. -2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. +2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```. ```bash git clone git@github.com:/coqui-ai-TTS.git diff --git a/Makefile b/Makefile index 1d6867f5e8..35345b8c1f 100644 --- a/Makefile +++ b/Makefile @@ -6,48 +6,41 @@ help: target_dirs := tests TTS notebooks recipes -test_all: ## run tests and don't stop on an error. - nose2 --with-coverage --coverage TTS tests - ./run_bash_tests.sh - test: ## run tests. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --durations=0 tests test_vocoder: ## run vocoder tests. - coverage run -m nose2 -F -v -B tests.vocoder_tests + coverage run -m pytest -x -v --durations=0 tests/vocoder_tests test_tts: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests + coverage run -m pytest -x -v --durations=0 tests/tts_tests test_tts2: ## run tts tests. 
- coverage run -m nose2 -F -v -B tests.tts_tests2 + coverage run -m pytest -x -v --durations=0 tests/tts_tests2 test_xtts: - coverage run -m nose2 -F -v -B tests.xtts_tests + coverage run -m pytest -x -v --durations=0 tests/xtts_tests test_aux: ## run aux tests. - coverage run -m nose2 -F -v -B tests.aux_tests - ./run_bash_tests.sh + coverage run -m pytest -x -v --durations=0 tests/aux_tests + +test_zoo: ## run zoo tests. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_models.py -test_zoo0: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ - tests.zoo_tests.test_models.test_voice_conversion -test_zoo1: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 -test_zoo2: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 +test_zoo_big: ## run tests for models that are too big for CI. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_big_models.py inference_tests: ## run inference tests. - coverage run -m nose2 -F -v -B tests.inference_tests + coverage run -m pytest -x -v --durations=0 tests/inference_tests data_tests: ## run data tests. - coverage run -m nose2 -F -v -B tests.data_tests + coverage run -m pytest -x -v --durations=0 tests/data_tests test_text: ## run text tests. - coverage run -m nose2 -F -v -B tests.text_tests + coverage run -m pytest -x -v --durations=0 tests/text_tests test_failed: ## only run tests failed the last time. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --last-failed tests style: ## update code style. uv run --only-dev black ${target_dirs} @@ -59,9 +52,6 @@ lint: ## run linters. system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev -build-docs: ## build the docs - cd docs && make clean && make build - install: ## install 🐸 TTS uv sync --all-extras @@ -70,4 +60,4 @@ install_dev: ## install 🐸 TTS for development. uv run pre-commit install docs: ## build the docs - $(MAKE) -C docs clean && $(MAKE) -C docs html + uv run --group docs $(MAKE) -C docs clean && uv run --group docs $(MAKE) -C docs html diff --git a/README.md b/README.md index 7dddf3a37b..c0843b731d 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,34 @@ +# -## 🐸Coqui TTS News -- 📣 Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts) -- 📣 [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion. -- 📣 Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. -- 📣 ⓍTTSv2 is here with 17 languages and better performance across the board. ⓍTTS can stream with <200ms latency. -- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). -- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html) -- 📣 You can use [Fairseq models in ~1100 languages](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. -## - - -**🐸TTS is a library for advanced Text-to-Speech generation.** +**🐸 Coqui TTS is a library for advanced Text-to-Speech generation.** 🚀 Pretrained models in +1100 languages. 
🛠️ Tools for training new models and fine-tuning existing models in any language. 📚 Utilities for dataset analysis and curation. -______________________________________________________________________ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/coqui-tts)](https://pypi.org/project/coqui-tts/) [![License]()](https://opensource.org/licenses/MPL-2.0) -[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://badge.fury.io/py/coqui-tts) +[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://pypi.org/project/coqui-tts/) [![Downloads](https://pepy.tech/badge/coqui-tts)](https://pepy.tech/project/coqui-tts) [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440) - -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg) -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg) -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml) [![Docs]()](https://coqui-tts.readthedocs.io/en/latest/) -______________________________________________________________________ +## 📣 News +- **Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts)** +- 0.25.0: [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion. +- 0.24.2: Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. +- 0.20.0: XTTSv2 is here with 17 languages and better performance across the board. XTTS can stream with <200ms latency. +- 0.19.0: XTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). +- 0.14.1: You can use [Fairseq models in ~1100 languages](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. ## 💬 Where to ask questions Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it. @@ -63,71 +58,68 @@ repository are also still a useful source of information. | 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)| ## Features -- High-performance Deep Learning models for Text2Speech tasks. See lists of models below. -- Fast and efficient model training. -- Detailed training logs on the terminal and Tensorboard. -- Support for Multi-speaker TTS. -- Efficient, flexible, lightweight but feature complete `Trainer API`. +- High-performance text-to-speech and voice conversion models, see list below. 
+- Fast and efficient model training with detailed training logs on the terminal and Tensorboard. +- Support for multi-speaker and multilingual TTS. - Released and ready-to-use models. -- Tools to curate Text2Speech datasets under```dataset_analysis```. -- Utilities to use and test your models. +- Tools to curate TTS datasets under ```dataset_analysis/```. +- Command line and Python APIs to use and test your models. - Modular (but not too much) code base enabling easy implementation of new ideas. ## Model Implementations ### Spectrogram models -- Tacotron: [paper](https://arxiv.org/abs/1703.10135) -- Tacotron2: [paper](https://arxiv.org/abs/1712.05884) -- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129) -- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802) -- Align-TTS: [paper](https://arxiv.org/abs/2003.01950) -- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf) -- FastSpeech: [paper](https://arxiv.org/abs/1905.09263) -- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558) -- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557) -- Capacitron: [paper](https://arxiv.org/abs/1906.03402) -- OverFlow: [paper](https://arxiv.org/abs/2211.06892) -- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320) -- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612) +- [Tacotron](https://arxiv.org/abs/1703.10135), [Tacotron2](https://arxiv.org/abs/1712.05884) +- [Glow-TTS](https://arxiv.org/abs/2005.11129), [SC-GlowTTS](https://arxiv.org/abs/2104.05557) +- [Speedy-Speech](https://arxiv.org/abs/2008.03802) +- [Align-TTS](https://arxiv.org/abs/2003.01950) +- [FastPitch](https://arxiv.org/pdf/2006.06873.pdf) +- [FastSpeech](https://arxiv.org/abs/1905.09263), [FastSpeech2](https://arxiv.org/abs/2006.04558) +- [Capacitron](https://arxiv.org/abs/1906.03402) +- [OverFlow](https://arxiv.org/abs/2211.06892) +- [Neural HMM TTS](https://arxiv.org/abs/2108.13320) +- [Delightful TTS](https://arxiv.org/abs/2110.12612) ### End-to-End Models -- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts) -- VITS: [paper](https://arxiv.org/pdf/2106.06103) -- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418) -- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts) -- 🐶 Bark: [orig. 
repo](https://github.com/suno-ai/bark) - -### Attention Methods -- Guided Attention: [paper](https://arxiv.org/abs/1710.08969) -- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006) -- Graves Attention: [paper](https://arxiv.org/abs/1910.10288) -- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) -- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf) -- Alignment Network: [paper](https://arxiv.org/abs/2108.10447) - -### Speaker Encoder -- GE2E: [paper](https://arxiv.org/abs/1710.10467) -- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf) +- [XTTS](https://arxiv.org/abs/2406.04904) +- [VITS](https://arxiv.org/pdf/2106.06103) +- 🐸[YourTTS](https://arxiv.org/abs/2112.02418) +- 🐢[Tortoise](https://github.com/neonbjb/tortoise-tts) +- 🐶[Bark](https://github.com/suno-ai/bark) ### Vocoders -- MelGAN: [paper](https://arxiv.org/abs/1910.06711) -- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106) -- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480) -- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646) -- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) -- WaveGrad: [paper](https://arxiv.org/abs/2009.00713) -- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646) -- UnivNet: [paper](https://arxiv.org/abs/2106.07889) +- [MelGAN](https://arxiv.org/abs/1910.06711) +- [MultiBandMelGAN](https://arxiv.org/abs/2005.05106) +- [ParallelWaveGAN](https://arxiv.org/abs/1910.11480) +- [GAN-TTS discriminators](https://arxiv.org/abs/1909.11646) +- [WaveRNN](https://github.com/fatchord/WaveRNN/) +- [WaveGrad](https://arxiv.org/abs/2009.00713) +- [HiFiGAN](https://arxiv.org/abs/2010.05646) +- [UnivNet](https://arxiv.org/abs/2106.07889) ### Voice Conversion -- FreeVC: [paper](https://arxiv.org/abs/2210.15418) -- OpenVoice: [technical report](https://arxiv.org/abs/2312.01479) +- [FreeVC](https://arxiv.org/abs/2210.15418) +- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419) +- [OpenVoice](https://arxiv.org/abs/2312.01479) + +### Others +- Attention methods: [Guided Attention](https://arxiv.org/abs/1710.08969), + [Forward Backward Decoding](https://arxiv.org/abs/1907.09006), + [Graves Attention](https://arxiv.org/abs/1910.10288), + [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/), + [Dynamic Convolutional Attention](https://arxiv.org/pdf/1910.10288.pdf), + [Alignment Network](https://arxiv.org/abs/2108.10447) +- Speaker encoders: [GE2E](https://arxiv.org/abs/1710.10467), + [Angular Loss](https://arxiv.org/pdf/2003.11982.pdf) You can also help us implement more models. + ## Installation -🐸TTS is tested on Ubuntu 22.04 with **python >= 3.9, < 3.13.**. -If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. +🐸TTS is tested on Ubuntu 24.04 with **python >= 3.9, < 3.13**, but should also +work on Mac and Windows. + +If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option. ```bash pip install coqui-tts @@ -165,24 +157,21 @@ pip install -e .[server,ja] ### Platforms -If you are on Ubuntu (Debian), you can also run following commands for installation. 
+If you are on Ubuntu (Debian), you can also run the following commands for installation. ```bash -make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. +make system-deps make install ``` -If you are on Windows, 👑@GuyPaddock wrote installation instructions -[here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system) -(note that these are out of date, e.g. you need to have at least Python 3.9). - + ## Docker Image -You can also try TTS without install with the docker image. -Simply run the following command and you will be able to run TTS without installing it. +You can also try out Coqui TTS without installation with the docker image. +Simply run the following command and you will be able to run TTS: ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server ``` @@ -193,10 +182,10 @@ More details about the docker images (like GPU support) can be found ## Synthesizing speech by 🐸TTS - + ### 🐍 Python API -#### Running a multi-speaker and multi-lingual model +#### Multi-speaker and multi-lingual model ```python import torch @@ -208,47 +197,64 @@ device = "cuda" if torch.cuda.is_available() else "cpu" # List available 🐸TTS models print(TTS().list_models()) -# Init TTS +# Initialize TTS tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) +# List speakers +print(tts.speakers) + # Run TTS -# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language -# Text to speech list of amplitude values as output -wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") -# Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") +# ❗ XTTS supports both, but many models allow only one of the `speaker` and +# `speaker_wav` arguments + +# TTS with list of amplitude values as output, clone the voice from `speaker_wav` +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) + +# TTS to a file, use a preset speaker +tts.tts_to_file( + text="Hello world!", + speaker="Craig Gutsy", + language="en", + file_path="output.wav" +) ``` -#### Running a single speaker model +#### Single speaker model ```python -# Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device) +# Initialize TTS with the target model name +tts = TTS("tts_models/de/thorsten/tacotron2-DDC").to(device) # Run TTS tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) - -# Example voice cloning with YourTTS in English, French and Portuguese -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device) -tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav") -tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav") ``` -#### Example voice conversion +#### Voice conversion (VC) 
-Converting the voice in `source_wav` to the voice of `target_wav` +Converting the voice in `source_wav` to the voice of `target_wav`: ```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") -tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) ``` Other available voice conversion models: +- `voice_conversion_models/multilingual/multi-dataset/knnvc` - `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` - `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` -#### Example voice cloning together with the default voice conversion model. +For more details, see the +[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html). + +#### Voice cloning by combining single speaker TTS model with the default VC model This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is used for voice conversion after synthesizing speech. @@ -263,7 +269,7 @@ tts.tts_with_vc_to_file( ) ``` -#### Example text to speech using **Fairseq models in ~1100 languages** 🤯. +#### TTS using Fairseq models in ~1100 languages 🤯 For Fairseq models, use the following name format: `tts_models//fairseq/vits`. You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). @@ -277,147 +283,126 @@ api.tts_to_file( ) ``` -### Command-line `tts` +### Command-line interface `tts` -Synthesize speech on command line. +Synthesize speech on the command line. You can either use your trained model or choose a model from the provided list. -If you don't specify any models, then it uses LJSpeech based English model. - -#### Single Speaker Models - - List provided models: + ```sh + tts --list_models ``` - $ tts --list_models - ``` - -- Get model info (for both tts_models and vocoder_models): - - - Query by type/name: - The model_info_by_name uses the name as it from the --list_models. - ``` - $ tts --model_info_by_name "///" - ``` - For example: - ``` - $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts - $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 - ``` - - Query by type/idx: - The model_query_idx uses the corresponding idx from --list_models. - - ``` - $ tts --model_info_by_idx "/" - ``` - For example: - - ``` - $ tts --model_info_by_idx tts_models/3 - ``` +- Get model information. Use the names obtained from `--list_models`. 
+ ```sh + tts --model_info_by_name "///" + ``` + For example: + ```sh + tts --model_info_by_name tts_models/tr/common-voice/glow-tts + tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` - - Query info for model info by full name: - ``` - $ tts --model_info_by_name "///" - ``` +#### Single speaker models -- Run TTS with default models: +- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`): - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run TTS and pipe out the generated TTS wav file data: - ``` - $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ```sh + tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` - Run a TTS model with its default vocoder model: - ``` - $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "///" \ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "tts_models/en/ljspeech/glow-tts" \ + --out_path output/path/speech.wav ``` -- Run with specific TTS and vocoder models from the list: +- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model. - ``` - $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "///" \ + --vocoder_name "///" \ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "tts_models/en/ljspeech/glow-tts" \ + --vocoder_name "vocoder_models/en/ljspeech/univnet" \ + --out_path output/path/speech.wav ``` -- Run your own TTS model (Using Griffin-Lim Vocoder): +- Run your own TTS model (using Griffin-Lim Vocoder): - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_path path/to/model.pth \ + --config_path path/to/config.json \ + --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ```sh + tts --text "Text for TTS" \ + --model_path path/to/model.pth \ + --config_path path/to/config.json \ + --out_path output/path/speech.wav \ + --vocoder_path path/to/vocoder.pth \ + --vocoder_config_path path/to/vocoder_config.json ``` -#### Multi-speaker Models +#### Multi-speaker models -- List the available speakers and choose a among them: +- List the available speakers and choose a `` among them: - ``` - $ tts --model_name "//" --list_speaker_idxs + ```sh + tts --model_name "//" --list_speaker_idxs ``` - Run the multi-speaker TTS model with the target speaker ID: - ``` - $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ```sh + tts --text "Text for TTS." 
--out_path output/path/speech.wav \ + --model_name "//" --speaker_idx ``` - Run your own multi-speaker TTS model: - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav \ + --model_path path/to/model.pth --config_path path/to/config.json \ + --speakers_file_path path/to/speaker.json --speaker_idx ``` -### Voice Conversion Models +#### Voice conversion models -``` -$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +```sh +tts --out_path output/path/speech.wav --model_name "//" \ + --source_wav --target_wav ``` - -## Directory Structure -``` -|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) -|- utils/ (common utilities.) -|- TTS - |- bin/ (folder for all the executables.) - |- train*.py (train your target model.) - |- ... - |- tts/ (text to speech models) - |- layers/ (model layer definitions) - |- models/ (model definitions) - |- utils/ (model specific utilities.) - |- speaker_encoder/ (Speaker Encoder models.) - |- (same) - |- vocoder/ (Vocoder models.) - |- (same) - |- vc/ (Voice conversion models.) - |- (same) -``` diff --git a/TTS/.models.json b/TTS/.models.json index 36654d0555..05c88bef43 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -787,6 +787,22 @@ "license": "apache 2.0" } }, + "librispeech100": { + "wavlm-hifigan": { + "description": "HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + }, + "wavlm-hifigan_prematched": { + "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + } + }, "ljspeech": { "multiband-melgan": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", @@ -927,18 +943,27 @@ "freevc24": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", + "default_vocoder": null, "author": "Jing-Yi Li @OlaWod", "license": "MIT", "commit": null } }, "multi-dataset": { + "knnvc": { + "description": "kNN-VC model from https://github.com/bshall/knn-vc", + "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT", + "commit": null + }, "openvoice_v1": { "hf_url": [ "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json", "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth" ], "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, "author": "MyShell.ai", "license": "MIT", "commit": null @@ -949,6 +974,7 @@ 
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth" ], "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, "author": "MyShell.ai", "license": "MIT", "commit": null diff --git a/TTS/api.py b/TTS/api.py index 83189482cb..6db929411c 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -4,7 +4,7 @@ import tempfile import warnings from pathlib import Path -from typing import Optional +from typing import Optional, Union from torch import nn @@ -77,8 +77,8 @@ def __init__( super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = load_config(config_path) if config_path else None - self.synthesizer = None - self.voice_converter = None + self.synthesizer: Optional[Synthesizer] = None + self.voice_converter: Optional[Synthesizer] = None self.model_name = "" self.vocoder_path = vocoder_path @@ -95,7 +95,7 @@ def __init__( if "tts_models" in model_name: self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) elif "voice_conversion_models" in model_name: - self.load_vc_model_by_name(model_name, gpu=gpu) + self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu) # To allow just TTS("xtts") else: self.load_model_by_name(model_name, vocoder_name, gpu=gpu) @@ -157,22 +157,24 @@ def list_models() -> list[str]: def download_model_by_name( self, model_name: str, vocoder_name: Optional[str] = None - ) -> tuple[Optional[str], Optional[str], Optional[str]]: + ) -> tuple[Optional[Path], Optional[Path], Optional[Path], Optional[Path], Optional[Path]]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files # we assume that the model knows how to load itself - return None, None, model_path + return None, None, None, None, model_path if model_item.get("default_vocoder") is None: - return model_path, config_path, None + return model_path, config_path, None, None, None if vocoder_name is None: vocoder_name = model_item["default_vocoder"] - vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) - # A local vocoder model will take precedence if specified via vocoder_path - if self.vocoder_path is None or self.vocoder_config_path is None: - self.vocoder_path = vocoder_path - self.vocoder_config_path = vocoder_config_path - return model_path, config_path, None + vocoder_path, vocoder_config_path = None, None + # A local vocoder model will take precedence if already specified in __init__ + if model_item["model_type"] == "tts_models": + vocoder_path = self.vocoder_path + vocoder_config_path = self.vocoder_config_path + if vocoder_path is None or vocoder_config_path is None: + vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) + return model_path, config_path, vocoder_path, vocoder_config_path, None def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. 
@@ -183,7 +185,7 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None """ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: + def load_vc_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. Args: @@ -191,9 +193,16 @@ def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ self.model_name = model_name - model_path, config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) self.voice_converter = Synthesizer( - vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu + vc_checkpoint=model_path, + vc_config=config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + model_dir=model_dir, + use_cuda=gpu, ) def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: @@ -208,7 +217,9 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = self.synthesizer = None self.model_name = model_name - model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) # init synthesizer # None values are fetch from the model @@ -217,8 +228,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=self.vocoder_path, - vocoder_config=self.vocoder_config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, encoder_checkpoint=self.encoder_path, encoder_config=self.encoder_config_path, model_dir=model_dir, @@ -273,11 +284,11 @@ def _check_arguments( def tts( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: Optional[str] = None, + language: Optional[str] = None, + speaker_wav: Optional[str] = None, + emotion: Optional[str] = None, + speed: Optional[float] = None, split_sentences: bool = True, **kwargs, ): @@ -322,10 +333,10 @@ def tts( def tts_to_file( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, + speaker: Optional[str] = None, + language: Optional[str] = None, + speaker_wav: Optional[str] = None, + emotion: Optional[str] = None, speed: float = 1.0, pipe_out=None, file_path: str = "output.wav", @@ -377,7 +388,7 @@ def tts_to_file( def voice_conversion( self, source_wav: str, - target_wav: str, + target_wav: Union[str, list[str]], ): """Voice conversion with FreeVC. Convert source wav to target speaker. 
@@ -395,7 +406,7 @@ def voice_conversion( def voice_conversion_to_file( self, source_wav: str, - target_wav: str, + target_wav: Union[str, list[str]], file_path: str = "output.wav", pipe_out=None, ) -> str: @@ -418,9 +429,10 @@ def voice_conversion_to_file( def tts_with_vc( self, text: str, - language: str = None, - speaker_wav: str = None, - speaker: str = None, + *, + language: Optional[str] = None, + speaker_wav: Union[str, list[str]], + speaker: Optional[str] = None, split_sentences: bool = True, ): """Convert text to speech with voice conversion. @@ -460,10 +472,11 @@ def tts_with_vc( def tts_with_vc_to_file( self, text: str, - language: str = None, - speaker_wav: str = None, + *, + language: Optional[str] = None, + speaker_wav: Union[str, list[str]], file_path: str = "output.wav", - speaker: str = None, + speaker: Optional[str] = None, split_sentences: bool = True, pipe_out=None, ) -> str: diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 535182d214..8d7a2633a0 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -2,6 +2,7 @@ import importlib import logging import os +import sys from argparse import RawTextHelpFormatter import numpy as np @@ -18,7 +19,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # pylint: disable=bad-option-value parser = argparse.ArgumentParser( @@ -112,7 +113,7 @@ # compute attentions file_paths = [] - with torch.no_grad(): + with torch.inference_mode(): for data in tqdm(loader): # setup input data text_input = data[0] diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 1bdb8d733c..f103350912 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -1,7 +1,9 @@ import argparse import logging import os +import sys from argparse import RawTextHelpFormatter +from typing import Optional import torch from tqdm import tqdm @@ -14,6 +16,88 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" + """ + Example runs: + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json + + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "--model_path", + type=str, + help="Path to model checkpoint file. It defaults to the released speaker encoder.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to model config file. 
It defaults to the released speaker encoder config.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", + ) + parser.add_argument( + "--config_dataset_path", + type=str, + help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", + default=None, + ) + parser.add_argument( + "--output_path", + type=str, + help="Path for output `pth` or `json` file.", + default="speakers.pth", + ) + parser.add_argument( + "--old_file", + type=str, + help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", + default=None, + ) + parser.add_argument( + "--old_append", + help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", + default=False, + action="store_true", + ) + parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) + parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") + parser.add_argument( + "--formatter_name", + type=str, + help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_path", + type=str, + help="Path to the dataset. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_train", + type=str, + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + return parser.parse_args() + + def compute_embeddings( model_path, config_path, @@ -101,88 +185,9 @@ def compute_embeddings( print("Speaker embeddings saved at:", mapping_file_path) -if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser( - description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" - """ - Example runs: - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json - - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv - """, - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--model_path", - type=str, - help="Path to model checkpoint file. It defaults to the released speaker encoder.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to model config file. 
It defaults to the released speaker encoder config.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", - ) - parser.add_argument( - "--config_dataset_path", - type=str, - help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", - default=None, - ) - parser.add_argument( - "--output_path", - type=str, - help="Path for output `pth` or `json` file.", - default="speakers.pth", - ) - parser.add_argument( - "--old_file", - type=str, - help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", - default=None, - ) - parser.add_argument( - "--old_append", - help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", - default=False, - action="store_true", - ) - parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) - parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") - parser.add_argument( - "--formatter_name", - type=str, - help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_name", - type=str, - help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_path", - type=str, - help="Path to the dataset. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_train", - type=str, - help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_val", - type=str, - help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. 
You either need to provide this or `config_dataset_path`", - default=None, - ) - args = parser.parse_args() +def main(arg_list: Optional[list[str]] = None): + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) compute_embeddings( args.model_path, @@ -199,3 +204,7 @@ def compute_embeddings( disable_cuda=args.disable_cuda, no_eval=args.no_eval, ) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index dc5423a691..b7c52ac6c5 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -5,6 +5,8 @@ import glob import logging import os +import sys +from typing import Optional import numpy as np from tqdm import tqdm @@ -16,10 +18,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def main(): - """Run preprocessing process.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - +def parse_args(arg_list: Optional[list[str]]) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -29,7 +28,13 @@ def main(): required=False, help="folder including the target set of wavs overriding dataset config.", ) - args, overrides = parser.parse_known_args() + return parser.parse_known_args(arg_list) + + +def main(arg_list: Optional[list[str]] = None): + """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) + args, overrides = parse_args(arg_list) CONFIG = load_config(args.config_path) CONFIG.parse_known_args(overrides, relaxed_parser=True) @@ -94,6 +99,7 @@ def main(): stats["audio_config"] = CONFIG.audio.to_dict() np.save(output_file_path, stats, allow_pickle=True) print(f" > stats saved to {output_file_path}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index 711c8221db..701c7d8e82 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,5 +1,6 @@ import argparse import logging +import sys from argparse import RawTextHelpFormatter import torch @@ -53,7 +54,7 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser( description="""Compute the accuracy of the encoder.\n\n""" diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 86a4dce177..77072f9efa 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -3,7 +3,9 @@ import argparse import logging -import os +import sys +from pathlib import Path +from typing import Optional import numpy as np import torch @@ -12,8 +14,10 @@ from trainer.generic_utils import count_parameters from TTS.config import load_config +from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from 
TTS.utils.audio import AudioProcessor @@ -23,56 +27,66 @@ use_cuda = torch.cuda.is_available() -def setup_loader(ap, r): - tokenizer, _ = TTSTokenizer.init_from_config(c) +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) + return parser.parse_args(arg_list) + + +def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader: + tokenizer, _ = TTSTokenizer.init_from_config(config) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=False, - samples=meta_data, + samples=samples, tokenizer=tokenizer, ap=ap, batch_group_size=0, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - phoneme_cache_path=c.phoneme_cache_path, + min_text_len=config.min_text_len, + max_text_len=config.max_text_len, + min_audio_len=config.min_audio_len, + max_audio_len=config.max_audio_len, + phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None, ) - if c.use_phonemes and c.compute_input_seq_cache: + if config.use_phonemes and config.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(c.num_loader_workers) + dataset.compute_input_seq(config.num_loader_workers) dataset.preprocess_samples() - loader = DataLoader( + return DataLoader( dataset, - batch_size=c.batch_size, + batch_size=config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=None, - num_workers=c.num_loader_workers, + num_workers=config.num_loader_workers, pin_memory=False, ) - return loader -def set_filename(wav_path, out_path): - wav_file = os.path.basename(wav_path) - file_name = wav_file.split(".")[0] - os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) - os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) - wavq_path = os.path.join(out_path, "quant", file_name) - mel_path = os.path.join(out_path, "mel", file_name) - wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") - wav_path = os.path.join(out_path, "wav", file_name + ".wav") - return file_name, wavq_path, mel_path, wav_gl_path, wav_path +def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]: + wav_name = Path(wav_path).stem + (out_path / "quant").mkdir(exist_ok=True, parents=True) + (out_path / "mel").mkdir(exist_ok=True, parents=True) + (out_path / "wav_gl").mkdir(exist_ok=True, parents=True) + (out_path / "wav").mkdir(exist_ok=True, parents=True) + wavq_path = out_path / "quant" / wav_name + mel_path = out_path / "mel" / wav_name + wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav" + out_wav_path = out_path / "wav" / f"{wav_name}.wav" + return wavq_path, mel_path, wav_gl_path, out_wav_path def format_data(data): @@ -114,18 +128,18 @@ def format_data(data): ) -@torch.no_grad() +@torch.inference_mode() def inference( - model_name, - model, - ap, + model_name: str, + model: BaseTTS, + ap: AudioProcessor, text_input, text_lengths, mel_input, mel_lengths, speaker_ids=None, d_vectors=None, -): +) -> np.ndarray: if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: @@ -140,9 +154,9 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.detach().cpu().numpy() + return model_output.detach().cpu().numpy() - elif "tacotron" in model_name: + if "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] @@ -153,16 +167,24 @@ def inference( for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) - model_output = torch.stack(mel_specs).cpu().numpy() - - elif model_name == "tacotron2": - model_output = postnet_outputs.detach().cpu().numpy() - return model_output + return torch.stack(mel_specs).cpu().numpy() + if model_name == "tacotron2": + return postnet_outputs.detach().cpu().numpy() + msg = f"Model not supported: {model_name}" + raise ValueError(msg) def extract_spectrograms( - data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" -): + model_name: str, + data_loader: DataLoader, + model: BaseTTS, + ap: AudioProcessor, + output_path: Path, + quantize_bits: int = 0, + save_audio: bool = False, + debug: bool = False, + metadata_name: str = "metadata.txt", +) -> None: model.eval() export_metadata = [] for 
_, data in tqdm(enumerate(data_loader), total=len(data_loader)): @@ -181,7 +203,7 @@ def extract_spectrograms( ) = format_data(data) model_output = inference( - c.model.lower(), + model_name, model, ap, text_input, @@ -195,7 +217,7 @@ def extract_spectrograms( for idx in range(text_input.shape[0]): wav_file_path = item_idx[idx] wav = ap.load_wav(wav_file_path) - _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) # quantize and save wav if quantize_bits > 0: @@ -217,74 +239,67 @@ def extract_spectrograms( wav = ap.inv_melspectrogram(mel) ap.save_wav(wav, wav_gl_path) - with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + with (output_path / metadata_name).open("w") as f: for data in export_metadata: - f.write(f"{data[0]}|{data[1]+'.npy'}\n") + f.write(f"{data[0] / data[1]}.npy\n") -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data, speaker_manager +def main(arg_list: Optional[list[str]] = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) + config = load_config(args.config_path) + config.audio.trim_silence = False # Audio processor - ap = AudioProcessor(**c.audio) + ap = AudioProcessor(**config.audio) # load data instances meta_data_train, meta_data_eval = load_tts_samples( - c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=args.eval, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # init speaker manager - if c.use_speaker_embedding: + if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif c.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) else: speaker_manager = None # setup model - model = setup_model(c) + model = setup_model(config) # restore model - model.load_checkpoint(c, args.checkpoint_path, eval=True) + model.load_checkpoint(config, args.checkpoint_path, eval=True) if use_cuda: model.cuda() num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) # set r - r = 1 if c.model.lower() == "glow_tts" else model.decoder.r - own_loader = setup_loader(ap, r) + r = 1 if config.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(config, ap, r, speaker_manager, meta_data) extract_spectrograms( + config.model.lower(), own_loader, model, ap, - args.output_path, + Path(args.output_path), quantize_bits=args.quantize_bits, save_audio=args.save_audio, debug=args.debug, - metada_name="metada.txt", + metadata_name="metadata.txt", ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser() - parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) - parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) - parser.add_argument("--output_path", type=str, help="Path 
to save mel specs", required=True) - parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") - parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") - parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) - args = parser.parse_args() - - c = load_config(args.config_path) - c.audio.trim_silence = False - main(args) + main() diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 0519d43769..7a7fdf5dd4 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -2,6 +2,7 @@ import argparse import logging +import sys from argparse import RawTextHelpFormatter from TTS.config import load_config @@ -10,7 +11,7 @@ def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # pylint: disable=bad-option-value parser = argparse.ArgumentParser( diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index d99acb9893..0c453db85b 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,9 +1,11 @@ -"""Find all the unique characters in a dataset""" +"""Find all the unique characters in a dataset.""" import argparse import logging import multiprocessing +import sys from argparse import RawTextHelpFormatter +from typing import Optional from tqdm.contrib.concurrent import process_map @@ -13,18 +15,13 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def compute_phonemes(item): +def compute_phonemes(item: dict) -> set[str]: text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") return set(ph) -def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - # pylint: disable=W0601 - global c, phonemizer - # pylint: disable=bad-option-value +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -35,13 +32,21 @@ def main(): formatter_class=RawTextHelpFormatter, ) parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) - args = parser.parse_args() + return parser.parse_args(arg_list) + - c = load_config(args.config_path) +def main(arg_list: Optional[list[str]] = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + global phonemizer + args = parse_args(arg_list) + config = load_config(args.config_path) # load all datasets train_items, eval_items = load_tts_samples( - c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) items = train_items + eval_items print("Num items:", len(items)) @@ -49,13 +54,16 @@ def main(): language_list = [item["language"] for item in items] is_lang_def = all(language_list) - if not c.phoneme_language or not is_lang_def: - raise ValueError("Phoneme language must be defined in config.") + if not config.phoneme_language or not is_lang_def: + msg = "Phoneme language must be defined in config." 
+ raise ValueError(msg) - if not language_list.count(language_list[0]) == len(language_list): - raise ValueError( - "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" + if language_list.count(language_list[0]) != len(language_list): + msg = ( + "Currently, just one phoneme language per config file is supported !! " + "Please split the dataset config into different configs and run it individually for each language !!" ) + raise ValueError(msg) phonemizer = Gruut(language=language_list[0], keep_puncs=True) @@ -73,6 +81,7 @@ def main(): print(f" > Unique phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index edab882db8..f9121d7f77 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -4,6 +4,7 @@ import multiprocessing import os import pathlib +import sys import torch from tqdm import tqdm @@ -77,7 +78,7 @@ def preprocess_audios(): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser( description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end" diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 885f6d6f0c..f963485c5d 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -7,6 +7,7 @@ import logging import sys from argparse import RawTextHelpFormatter +from typing import Optional # pylint: disable=redefined-outer-name, unused-argument from TTS.utils.generic_utils import ConsoleFormatter, setup_logger @@ -14,128 +15,127 @@ logger = logging.getLogger(__name__) description = """ -Synthesize speech on command line. +Synthesize speech on the command line. You can either use your trained model or choose a model from the provided list. -If you don't specify any models, then it uses LJSpeech based English model. - -#### Single Speaker Models - - List provided models: + ```sh + tts --list_models ``` - $ tts --list_models - ``` - -- Get model info (for both tts_models and vocoder_models): - - - Query by type/name: - The model_info_by_name uses the name as it from the --list_models. - ``` - $ tts --model_info_by_name "///" - ``` - For example: - ``` - $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts - $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 - ``` - - Query by type/idx: - The model_query_idx uses the corresponding idx from --list_models. - - ``` - $ tts --model_info_by_idx "/" - ``` - For example: - - ``` - $ tts --model_info_by_idx tts_models/3 - ``` +- Get model information. Use the names obtained from `--list_models`. 
+ ```sh + tts --model_info_by_name "///" + ``` + For example: + ```sh + tts --model_info_by_name tts_models/tr/common-voice/glow-tts + tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` - - Query info for model info by full name: - ``` - $ tts --model_info_by_name "///" - ``` +#### Single speaker models -- Run TTS with default models: +- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`): - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run TTS and pipe out the generated TTS wav file data: - ``` - $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ```sh + tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` - Run a TTS model with its default vocoder model: - ``` - $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "///" \\ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "tts_models/en/ljspeech/glow-tts" \\ + --out_path output/path/speech.wav ``` -- Run with specific TTS and vocoder models from the list: +- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model. - ``` - $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "///" \\ + --vocoder_name "///" \\ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "tts_models/en/ljspeech/glow-tts" \\ + --vocoder_name "vocoder_models/en/ljspeech/univnet" \\ + --out_path output/path/speech.wav ``` -- Run your own TTS model (Using Griffin-Lim Vocoder): +- Run your own TTS model (using Griffin-Lim Vocoder): - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_path path/to/model.pth \\ + --config_path path/to/config.json \\ + --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ```sh + tts --text "Text for TTS" \\ + --model_path path/to/model.pth \\ + --config_path path/to/config.json \\ + --out_path output/path/speech.wav \\ + --vocoder_path path/to/vocoder.pth \\ + --vocoder_config_path path/to/vocoder_config.json ``` -#### Multi-speaker Models +#### Multi-speaker models -- List the available speakers and choose a among them: +- List the available speakers and choose a `` among them: - ``` - $ tts --model_name "//" --list_speaker_idxs + ```sh + tts --model_name "//" --list_speaker_idxs ``` - Run the multi-speaker TTS model with the target speaker ID: - ``` - $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ```sh + tts --text "Text for TTS." 
--out_path output/path/speech.wav \\ + --model_name "//" --speaker_idx ``` - Run your own multi-speaker TTS model: - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav \\ + --model_path path/to/model.pth --config_path path/to/config.json \\ + --speakers_file_path path/to/speaker.json --speaker_idx ``` -### Voice Conversion Models +#### Voice conversion models -``` -$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +```sh +tts --out_path output/path/speech.wav --model_name "//" \\ + --source_wav --target_wav ``` """ -def parse_args() -> argparse.Namespace: +def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -275,13 +275,14 @@ def parse_args() -> argparse.Namespace: "--source_wav", type=str, default=None, - help="Original audio file to convert in the voice of the target_wav", + help="Original audio file to convert into the voice of the target_wav", ) parser.add_argument( "--target_wav", type=str, + nargs="*", default=None, - help="Target audio file to convert in the voice of the source_wav", + help="Audio file(s) of the target voice into which to convert the source_wav", ) parser.add_argument( @@ -291,7 +292,7 @@ def parse_args() -> argparse.Namespace: help="Voice dir for tortoise model", ) - args = parser.parse_args() + args = parser.parse_args(arg_list) # print the description if either text or list_models is not set check_args = [ @@ -310,10 +311,11 @@ def parse_args() -> argparse.Namespace: return args -def main() -> None: +def main(arg_list: Optional[list[str]] = None) -> None: """Entry point for `tts` command line interface.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - args = parse_args() + args = parse_args(arg_list) + stream = sys.stderr if args.pipe_out else sys.stdout + setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter()) pipe_out = sys.stdout if args.pipe_out else None @@ -340,18 +342,18 @@ def main() -> None: # 1) List pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() + sys.exit(0) # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) - sys.exit() + sys.exit(0) if args.model_info_by_name: model_query_full_name = args.model_info_by_name manager.model_info_by_full_name(model_query_full_name) - sys.exit() + sys.exit(0) # 3) Load a model for further info or TTS/VC device = args.device @@ -377,23 +379,23 @@ def main() -> None: if args.list_speaker_idxs: if not api.is_multi_speaker: logger.info("Model only has a single speaker.") - return + sys.exit(0) logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) logger.info(api.speakers) - return + sys.exit(0) # query langauge ids of a multi-lingual model. if args.list_language_idxs: if not api.is_multi_lingual: logger.info("Monolingual model.") - return + sys.exit(0) logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) logger.info(api.languages) - return + sys.exit(0) # check the arguments against a multi-speaker model. 
if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): @@ -401,7 +403,7 @@ def main() -> None: "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." ) - return + sys.exit(1) # RUN THE SYNTHESIS if args.text: @@ -430,6 +432,7 @@ def main() -> None: pipe_out=pipe_out, ) logger.info("Saved VC output to %s", args.out_path) + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index ba03c42b6d..a37ab8efc9 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -87,7 +87,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False): def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): - with torch.no_grad(): + with torch.inference_mode(): # setup input data inputs, labels = data @@ -322,7 +322,7 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training() diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 6d6342a762..e93b1c9d24 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,5 +1,6 @@ import logging import os +import sys from dataclasses import dataclass, field from trainer import Trainer, TrainerArgs @@ -17,7 +18,7 @@ class TrainTTSArgs(TrainerArgs): def main(): """Run `tts` model training directly by a `config.json` file.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # init trainer args train_args = TrainTTSArgs() diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 221ff4cff0..7cf5696237 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,6 +1,8 @@ import logging import os +import sys from dataclasses import dataclass, field +from typing import Optional from trainer import Trainer, TrainerArgs @@ -16,16 +18,16 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: Optional[list[str]] = None): """Run `tts` model training directly by a `config.json` file.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # init trainer args train_args = TrainVocoderArgs() parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -75,6 +77,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index df2923952d..d05ae14b7f 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -2,6 +2,7 @@ import argparse import logging +import sys from itertools import product as cartesian_product import numpy as np @@ -17,7 +18,7 @@ from TTS.vocoder.models import setup_model if __name__ == "__main__": - 
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 5103f200b0..e5f40c0296 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,7 +1,7 @@ import json import os import re -from typing import Dict +from typing import Any, Dict, Union import fsspec import yaml @@ -68,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: str) -> Coqpit: +def load_config(config_path: Union[str, os.PathLike[Any]]) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. @@ -81,6 +81,7 @@ def load_config(config_path: str) -> Coqpit: Returns: Coqpit: TTS config object. """ + config_path = str(config_path) config_dict = {} ext = os.path.splitext(config_path)[1] if ext in (".yml", ".yaml"): diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index 2082019aad..603481cc56 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -64,11 +64,11 @@ def get_torch_mel_spectrogram_class(self, audio_config): ), ) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, l2_norm=True): return self.forward(x, l2_norm) - @torch.no_grad() + @torch.inference_mode() def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate embeddings for a batch of utterances diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index da7522a512..37619ed0f8 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -216,7 +216,7 @@ def processor(directory, subset, force_process): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) if len(sys.argv) != 4: print("Usage: python prepare_data.py save_directory user password") sys.exit() diff --git a/TTS/model.py b/TTS/model.py index c3707c85ae..e024ad1a44 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -12,7 +12,7 @@ class BaseTrainerModel(TrainerModel): """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. - Every new 🐸TTS model must inherit it. + Every new Coqui model must inherit it. """ @staticmethod @@ -64,3 +64,7 @@ def load_checkpoint( It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ ... 
+ + @property + def device(self) -> torch.device: + return next(self.parameters()).device diff --git a/TTS/server/server.py b/TTS/server/server.py index f410fb7539..cb4ed4d9b2 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -8,7 +8,6 @@ import logging import os import sys -from pathlib import Path from threading import Lock from typing import Union from urllib.parse import parse_qs @@ -19,13 +18,12 @@ msg = "Server requires requires flask, use `pip install coqui-tts[server]`" raise ImportError(msg) from e -from TTS.config import load_config +from TTS.api import TTS from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.manage import ModelManager -from TTS.utils.synthesizer import Synthesizer logger = logging.getLogger(__name__) -setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) +setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) def create_argparser() -> argparse.ArgumentParser: @@ -60,6 +58,7 @@ def create_argparser() -> argparse.ArgumentParser: parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--port", type=int, default=5002, help="port to listen on.") + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.") parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode." @@ -73,8 +72,7 @@ def create_argparser() -> argparse.ArgumentParser: # parse the args args = create_argparser().parse_args() -path = Path(__file__).parent / "../.models.json" -manager = ModelManager(path) +manager = ModelManager(models_file=TTS.get_models_file_path()) # update in-use models to the specified released models. 
model_path = None @@ -86,51 +84,27 @@ def create_argparser() -> argparse.ArgumentParser: # CASE1: list pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() - -# CASE2: load pre-trained model paths -if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - -if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - -# CASE3: set custom model paths -if args.model_path is not None: - model_path = args.model_path - config_path = args.config_path - speakers_file_path = args.speakers_file_path - -if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - -# load models -synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=speakers_file_path, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint="", - encoder_config="", - use_cuda=args.use_cuda, -) - -use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and ( - synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None -) -speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) - -use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and ( - synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None -) -language_manager = getattr(synthesizer.tts_model, "language_manager", None) + sys.exit(0) + +device = args.device +if args.use_cuda: + device = "cuda" + +# CASE2: load models +model_name = args.model_name if args.model_path is None else None +api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + speakers_file_path=args.speakers_file_path, + # language_ids_file_path=args.language_ids_file_path, +).to(device) # TODO: set this from SpeakerManager -use_gst = synthesizer.tts_config.get("use_gst", False) +use_gst = api.synthesizer.tts_config.get("use_gst", False) app = Flask(__name__) @@ -158,27 +132,18 @@ def index(): return render_template( "index.html", show_details=args.show_details, - use_multi_speaker=use_multi_speaker, - use_multi_language=use_multi_language, - speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None, - language_ids=language_manager.name_to_id if language_manager is not None else None, + use_multi_speaker=api.is_multi_speaker, + use_multi_language=api.is_multi_lingual, + speaker_ids=api.speakers, + language_ids=api.languages, use_gst=use_gst, ) @app.route("/details") def details(): - if args.config_path is not None and os.path.isfile(args.config_path): - model_config = load_config(args.config_path) - elif args.model_name is not None: - model_config = load_config(config_path) - - if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path): - vocoder_config = load_config(args.vocoder_config_path) - elif args.vocoder_name is not None: - vocoder_config = load_config(vocoder_config_path) - else: - vocoder_config = None + model_config = api.synthesizer.tts_config + vocoder_config = api.synthesizer.vocoder_config or None return render_template( "details.html", @@ 
-196,17 +161,23 @@ def details(): def tts(): with lock: text = request.headers.get("text") or request.values.get("text", "") - speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "") - language_idx = request.headers.get("language-id") or request.values.get("language_id", "") + speaker_idx = ( + request.headers.get("speaker-id") or request.values.get("speaker_id", "") if api.is_multi_speaker else None + ) + language_idx = ( + request.headers.get("language-id") or request.values.get("language_id", "") + if api.is_multi_lingual + else None + ) style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) logger.info("Model input: %s", text) logger.info("Speaker idx: %s", speaker_idx) logger.info("Language idx: %s", language_idx) - wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) + wavs = api.tts(text, speaker=speaker_idx, language=language_idx, style_wav=style_wav) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") @@ -248,9 +219,9 @@ def mary_tts_api_process(): else: text = request.args.get("INPUT_TEXT", "") logger.info("Model input: %s", text) - wavs = synthesizer.tts(text) + wavs = api.tts(text) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index 6b7caab916..c1e0d006cb 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -112,7 +112,9 @@ def load_model(ckpt_path, device, config, model_type="text"): os.remove(ckpt_path) if not os.path.exists(ckpt_path): logger.info(f"{model_type} model not found, downloading...") - _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR) + # The URL in the config is a 404 and needs to be fixed + download_url = config.REMOTE_MODEL_PATHS[model_type]["path"].replace("tree", "resolve") + _download(download_url, ckpt_path, config.CACHE_DIR) checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4()) # this is a hack diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 981d6cdb1f..2aa82c9a88 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -421,7 +421,7 @@ def forward( "spk_emb": speaker_embedding, } - @torch.no_grad() + @torch.inference_mode() def inference( self, tokens: torch.Tensor, diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 2e6ac01a87..550ad3e3b2 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -97,7 +97,7 @@ def forward(self, latents, g=None): o = self.waveform_decoder(z, g=g) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g): """ Args: diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 44cf940c69..2f4b54cec1 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -45,7 +45,7 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): - @torch.no_grad() + @torch.inference_mode() def generate( # noqa: PLR0911 self, inputs: Optional[torch.Tensor] = None, @@ -662,7 +662,7 @@ def 
typeerror(): **model_kwargs, ) - @torch.no_grad() + @torch.inference_mode() def sample_stream( self, input_ids: torch.LongTensor, diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 076727239c..fec8358deb 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -501,12 +501,12 @@ def _remove_dots(m): def _expand_decimal_point(m, lang="en"): amount = m.group(1).replace(",", ".") - return num2words(float(amount), lang=lang if lang != "cs" else "cz") + return num2words(float(amount), lang=lang) def _expand_currency(m, lang="en", currency="USD"): amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) - full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz") + full_amount = num2words(amount, to="currency", currency=currency, lang=lang) and_equivalents = { "en": ", ", @@ -535,11 +535,11 @@ def _expand_currency(m, lang="en", currency="USD"): def _expand_ordinal(m, lang="en"): - return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") + return num2words(int(m.group(1)), ordinal=True, lang=lang) def _expand_number(m, lang="en"): - return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") + return num2words(int(m.group(0)), lang=lang) def expand_numbers_multilingual(text, lang="en"): diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 107054189c..9e8e753a61 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -197,10 +197,6 @@ def __init__(self, config: Coqpit): mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate ) - @property - def device(self): - return next(self.parameters()).device - def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens): """ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode @@ -225,7 +221,7 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels ) return losses - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: @@ -335,7 +331,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 WeightsFileHandler.add_pre_callback(callback_clearml_load_save) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 28a52bc558..c1d0cf0aea 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -288,7 +288,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index ced8f60ed8..6a480e6f5c 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -1,5 +1,6 @@ import os from dataclasses import dataclass +from pathlib import Path from typing import Optional import numpy as np @@ -42,10 +43,6 @@ def __init__( self.encodec = EncodecModel.encodec_model_24khz() self.encodec.set_target_bandwidth(6.0) - @property - def device(self): - return next(self.parameters()).device - def load_bark_models(self): self.semantic_model, self.config = load_model( ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text" 
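The tokenizer hunk above removes the `lang if lang != "cs" else "cz"` workaround around `num2words`. As a quick sanity check of the simplified helpers, assuming a num2words release recent enough to accept the standard ISO 639-1 code `cs` (older versions only recognised the nonstandard `cz`):

```python
from num2words import num2words

# Czech now goes through num2words with its standard code, matching the
# simplified _expand_number, _expand_ordinal and _expand_decimal_point helpers.
print(num2words(42, lang="cs"))               # cardinal
print(num2words(3, ordinal=True, lang="cs"))  # ordinal
print(num2words(1.5, lang="cs"))              # decimal amount
```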
@@ -206,12 +203,14 @@ def synthesize( speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in `voice_dirs` with the name `speaker_id`. Defaults to None. voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None. - **kwargs: Model specific inference settings used by `generate_audio()` and `TTS.tts.layers.bark.inference_funcs.generate_text_semantic(). + **kwargs: Model specific inference settings used by `generate_audio()` and + `TTS.tts.layers.bark.inference_funcs.generate_text_semantic()`. Returns: - A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference, - `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents` - as latents used at inference. + A dictionary of the output values with `wav` as output waveform, + `deterministic_seed` as seed used at inference, `text_input` as text token IDs + after tokenizer, `voice_samples` as samples used for cloning, + `conditioning_latents` as latents used at inference. """ speaker_id = "random" if speaker_id is None else speaker_id @@ -267,10 +266,12 @@ def load_checkpoint( fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt") hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth") + # The paths in the default config start with /root/.local/share/tts and need to be fixed self.config.LOCAL_MODEL_PATHS["text"] = text_model_path self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path + self.config.CACHE_DIR = str(Path(text_model_path).parent) self.load_bark_models() diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index ccb023ce84..33a75598c9 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -80,15 +80,17 @@ def _set_model_args(self, config: Coqpit): raise ValueError("config must be either a *Config or *Args") def init_multispeaker(self, config: Coqpit, data: List = None): - """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining - `in_channels` size of the connected layers. + """Set up for multi-speaker TTS. + + Initialize a speaker embedding layer if needed and define expected embedding + channel size for defining `in_channels` size of the connected layers. This implementation yields 3 possible outcomes: - 1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing. + 1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing. 2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512. 3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of - `config.d_vector_dim` or 512. + `config.d_vector_dim` or 512. You can override this function for new models. 
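The clarified `init_multispeaker` docstring above enumerates three outcomes. Purely as a simplified, self-contained sketch of that logic (hypothetical helper name, not the actual `BaseTTS` implementation):

```python
from typing import Optional, Tuple

from torch import nn


def init_multispeaker_sketch(config, num_speakers: int) -> Tuple[int, Optional[nn.Embedding]]:
    """Mirror the three outcomes described in BaseTTS.init_multispeaker."""
    embedded_speaker_dim = 0
    speaker_embedding: Optional[nn.Embedding] = None
    if getattr(config, "use_d_vector_file", False):
        # 2) external d-vectors: only the expected channel size is recorded
        embedded_speaker_dim = getattr(config, "d_vector_dim", None) or 512
    elif getattr(config, "use_speaker_embedding", False):
        # 3) learned speaker embedding layer with the same channel size
        embedded_speaker_dim = getattr(config, "d_vector_dim", None) or 512
        speaker_embedding = nn.Embedding(num_speakers, embedded_speaker_dim)
    # 1) both flags False: nothing to initialize
    return embedded_speaker_dim, speaker_embedding
```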
diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index e6db116081..bee008e26f 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -438,10 +438,6 @@ def __init__( periods=self.config.vocoder.periods_discriminator, ) - @property - def device(self): - return next(self.parameters()).device - @property def energy_scaler(self): return self.acoustic_model.energy_scaler @@ -622,7 +618,7 @@ def forward( model_outputs["slice_ids"] = slice_ids return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"d_vectors": None, "speaker_ids": None}, pitch_transform=None, energy_transform=None ): @@ -646,7 +642,7 @@ def inference( model_outputs["model_outputs"] = vocoder_output return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference_spec_decoder(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): encoder_outputs = self.acoustic_model.inference( tokens=x, @@ -1018,7 +1014,7 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): } return return_dict - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index d09e3ea91b..03166fa8c0 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -628,7 +628,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """Model's inference pass. diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5bf4713140..aaf5190ada 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -262,7 +262,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_with_MAS( self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -318,7 +318,7 @@ def inference_with_MAS( } return outputs - @torch.no_grad() + @torch.inference_mode() def decoder_inference( self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -341,7 +341,7 @@ def decoder_inference( outputs["logdet"] = logdet return outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -464,7 +464,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -473,7 +473,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. 
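The decorator swaps in these files (and throughout the patch) replace `torch.no_grad()` with `torch.inference_mode()` on inference-only entry points. A minimal sketch of the difference, outside any 🐸TTS class:

```python
import torch
from torch import nn

model = nn.Linear(4, 2)


@torch.inference_mode()  # disables autograd tracking *and* tensor version counting
def infer(x: torch.Tensor) -> torch.Tensor:
    return model(x)


out = infer(torch.randn(1, 4))
print(out.requires_grad)   # False
print(out.is_inference())  # True: such tensors can never re-enter autograd
```

Unlike `no_grad`, outputs created under inference mode cannot be used in a later `backward()`, which is fine for these `inference()`, `eval_step()` and `test_run()` paths and saves a little overhead.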
diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index 0b3fadafbf..b9a23000a0 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -195,7 +195,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index ac09e406ad..10157e43a4 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -33,32 +33,33 @@ class Overflow(BaseTTS): Paper abstract:: Neural HMMs are a type of neural transducer recently proposed for - sequence-to-sequence modelling in text-to-speech. They combine the best features - of classic statistical speech synthesis and modern neural TTS, requiring less - data and fewer training updates, and are less prone to gibberish output caused - by neural attention failures. In this paper, we combine neural HMM TTS with - normalising flows for describing the highly non-Gaussian distribution of speech - acoustics. The result is a powerful, fully probabilistic model of durations and - acoustics that can be trained using exact maximum likelihood. Compared to - dominant flow-based acoustic models, our approach integrates autoregression for - improved modelling of long-range dependences such as utterance-level prosody. - Experiments show that a system based on our proposal gives more accurate - pronunciations and better subjective speech quality than comparable methods, - whilst retaining the original advantages of neural HMMs. Audio examples and code - are available at https://shivammehta25.github.io/OverFlow/. + sequence-to-sequence modelling in text-to-speech. They combine the best features + of classic statistical speech synthesis and modern neural TTS, requiring less + data and fewer training updates, and are less prone to gibberish output caused + by neural attention failures. In this paper, we combine neural HMM TTS with + normalising flows for describing the highly non-Gaussian distribution of speech + acoustics. The result is a powerful, fully probabilistic model of durations and + acoustics that can be trained using exact maximum likelihood. Compared to + dominant flow-based acoustic models, our approach integrates autoregression for + improved modelling of long-range dependences such as utterance-level prosody. + Experiments show that a system based on our proposal gives more accurate + pronunciations and better subjective speech quality than comparable methods, + whilst retaining the original advantages of neural HMMs. Audio examples and code + are available at https://shivammehta25.github.io/OverFlow/. Note: - - Neural HMMs uses flat start initialization i.e it computes the means and std and transition probabilities - of the dataset and uses them to initialize the model. This benefits the model and helps with faster learning - If you change the dataset or want to regenerate the parameters change the `force_generate_statistics` and - `mel_statistics_parameter_path` accordingly. + - Neural HMMs uses flat start initialization i.e it computes the means + and std and transition probabilities of the dataset and uses them to initialize + the model. This benefits the model and helps with faster learning If you change + the dataset or want to regenerate the parameters change the + `force_generate_statistics` and `mel_statistics_parameter_path` accordingly. 
- To enable multi-GPU training, set the `use_grad_checkpointing=False` in config. - This will significantly increase the memory usage. This is because to compute - the actual data likelihood (not an approximation using MAS/Viterbi) we must use - all the states at the previous time step during the forward pass to decide the - probability distribution at the current step i.e the difference between the forward - algorithm and viterbi approximation. + This will significantly increase the memory usage. This is because to compute + the actual data likelihood (not an approximation using MAS/Viterbi) we must use + all the states at the previous time step during the forward pass to decide the + probability distribution at the current step i.e the difference between the forward + algorithm and viterbi approximation. Check :class:`TTS.tts.configs.overflow.OverFlowConfig` for class arguments. """ @@ -208,7 +209,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5d3efd2021..da85823f3f 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -218,7 +218,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text_input, aux_input=None): aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 2716a39786..e2edd4bb5c 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -238,7 +238,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text, aux_input=None): """Forward pass for inference with no Teacher-Forcing. diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 01629b5d2a..738e9dd9b3 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -423,7 +423,9 @@ def get_conditioning_latents( Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic properties. - :param voice_samples: List of arbitrary reference clips, which should be *pairs* of torch tensors containing arbitrary kHz waveform data. + + :param voice_samples: List of arbitrary reference clips, which should be *pairs* + of torch tensors containing arbitrary kHz waveform data. :param latent_averaging_mode: 0/1/2 for following modes: 0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples 1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks @@ -671,7 +673,7 @@ def inference( As cond_free_k increases, the output becomes dominated by the conditioning-free signal. diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 are the "mean" prediction of the diffusion network and will sound bland and smeared. - hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive transformer. 
+ hf_generate_kwargs: (`**kwargs`) The huggingface Transformers generate API is used for the autoregressive transformer. Extra keyword args fed to this function get forwarded directly to that API. Documentation here: https://huggingface.co/docs/transformers/internal/generation_utils diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7ec2519236..3d66b50598 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -565,10 +565,6 @@ def __init__( use_spectral_norm=self.args.use_spectral_norm_disriminator, ) - @property - def device(self): - return next(self.parameters()).device - def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -927,7 +923,7 @@ def _set_x_lengths(x, aux_input): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -1014,7 +1010,7 @@ def inference( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_voice_conversion( self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None ): @@ -1209,7 +1205,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) @@ -1266,7 +1262,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } - @torch.no_grad() + @torch.inference_mode() def test_run(self, assets) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. @@ -1294,7 +1290,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: do_trim_silence=False, ).values() test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index f05863ae1d..c0a50800f6 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -178,7 +178,7 @@ class XttsArgs(Coqpit): class Xtts(BaseTTS): - """ⓍTTS model implementation. + """XTTS model implementation. ❗ Currently it only supports inference. @@ -239,10 +239,6 @@ def init_models(self): cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, ) - @property - def device(self): - return next(self.parameters()).device - @torch.inference_mode() def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6): """Compute the conditioning latents for the GPT model from the given audio. @@ -460,7 +456,7 @@ def full_inference( gpt_cond_chunk_len: (int) Chunk length used for cloning. It must be <= `gpt_cond_len`. If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to 6 seconds. - hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive + hf_generate_kwargs: (`**kwargs`) The huggingface Transformers generate API is used for the autoregressive transformer. Extra keyword args fed to this function get forwarded directly to that API. 
Documentation here: https://huggingface.co/docs/transformers/internal/generation_utils diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index f134daf58e..c72de2d4e6 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import fsspec import numpy as np @@ -27,8 +27,8 @@ class LanguageManager(BaseIDManager): def __init__( self, - language_ids_file_path: str = "", - config: Coqpit = None, + language_ids_file_path: Union[str, os.PathLike[Any]] = "", + config: Optional[Coqpit] = None, ): super().__init__(id_file_path=language_ids_file_path) @@ -76,7 +76,7 @@ def parse_ids_from_data(items: List, parse_key: str) -> Any: def set_ids_from_data(self, items: List, parse_key: str) -> Any: raise NotImplementedError - def save_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """Save language IDs to a json file. Args: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 6a2f7df67b..e009a7c438 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -1,4 +1,5 @@ import json +import os import random from typing import Any, Dict, List, Tuple, Union @@ -12,7 +13,8 @@ from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -def load_file(path: str): +def load_file(path: Union[str, os.PathLike[Any]]): + path = str(path) if path.endswith(".json"): with fsspec.open(path, "r") as f: return json.load(f) @@ -23,7 +25,8 @@ def load_file(path: str): raise ValueError("Unsupported file type") -def save_file(obj: Any, path: str): +def save_file(obj: Any, path: Union[str, os.PathLike[Any]]): + path = str(path) if path.endswith(".json"): with fsspec.open(path, "w") as f: json.dump(obj, f, indent=4) @@ -39,20 +42,20 @@ class BaseIDManager: It defines common `ID` manager specific functions. """ - def __init__(self, id_file_path: str = ""): + def __init__(self, id_file_path: Union[str, os.PathLike[Any]] = ""): self.name_to_id = {} if id_file_path: self.load_ids_from_file(id_file_path) @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: + def _load_json(json_file_path: Union[str, os.PathLike[Any]]) -> Dict: + with fsspec.open(str(json_file_path), "r") as f: return json.load(f) @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: + def _save_json(json_file_path: Union[str, os.PathLike[Any]], data: dict) -> None: + with fsspec.open(str(json_file_path), "w") as f: json.dump(data, f, indent=4) def set_ids_from_data(self, items: List, parse_key: str) -> None: @@ -63,7 +66,7 @@ def set_ids_from_data(self, items: List, parse_key: str) -> None: """ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key) - def load_ids_from_file(self, file_path: str) -> None: + def load_ids_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """Set IDs from a file. Args: @@ -71,7 +74,7 @@ def load_ids_from_file(self, file_path: str) -> None: """ self.name_to_id = load_file(file_path) - def save_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """Save IDs to a json file. 
Args: @@ -130,10 +133,10 @@ class EmbeddingManager(BaseIDManager): def __init__( self, - embedding_file_path: Union[str, List[str]] = "", - id_file_path: str = "", - encoder_model_path: str = "", - encoder_config_path: str = "", + embedding_file_path: Union[Union[str, os.PathLike[Any]], list[Union[str, os.PathLike[Any]]]] = "", + id_file_path: Union[str, os.PathLike[Any]] = "", + encoder_model_path: Union[str, os.PathLike[Any]] = "", + encoder_config_path: Union[str, os.PathLike[Any]] = "", use_cuda: bool = False, ): super().__init__(id_file_path=id_file_path) @@ -176,7 +179,7 @@ def embedding_names(self): """Get embedding names.""" return list(self.embeddings_by_names.keys()) - def save_embeddings_to_file(self, file_path: str) -> None: + def save_embeddings_to_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """Save embeddings to a json file. Args: @@ -185,7 +188,7 @@ def save_embeddings_to_file(self, file_path: str) -> None: save_file(self.embeddings, file_path) @staticmethod - def read_embeddings_from_file(file_path: str): + def read_embeddings_from_file(file_path: Union[str, os.PathLike[Any]]): """Load embeddings from a json file. Args: @@ -204,7 +207,7 @@ def read_embeddings_from_file(file_path: str): embeddings_by_names[x["name"]].append(x["embedding"]) return name_to_id, clip_ids, embeddings, embeddings_by_names - def load_embeddings_from_file(self, file_path: str) -> None: + def load_embeddings_from_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """Load embeddings from a json file. Args: @@ -214,7 +217,7 @@ def load_embeddings_from_file(self, file_path: str) -> None: file_path ) - def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None: + def load_embeddings_from_list_of_files(self, file_paths: list[Union[str, os.PathLike[Any]]]) -> None: """Load embeddings from a list of json files and don't allow duplicate keys. Args: @@ -313,7 +316,9 @@ def get_random_embedding(self) -> Any: def get_clips(self) -> List: return sorted(self.embeddings.keys()) - def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None: + def init_encoder( + self, model_path: Union[str, os.PathLike[Any]], config_path: Union[str, os.PathLike[Any]], use_cuda=False + ) -> None: """Initialize a speaker encoder model. Args: @@ -325,11 +330,14 @@ def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> Non self.encoder_config = load_config(config_path) self.encoder = setup_encoder_model(self.encoder_config) self.encoder_criterion = self.encoder.load_checkpoint( - self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True + self.encoder_config, str(model_path), eval=True, use_cuda=use_cuda, cache=True ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) - def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list: + @torch.inference_mode() + def compute_embedding_from_clip( + self, wav_file: Union[Union[str, os.PathLike[Any]], List[Union[str, os.PathLike[Any]]]] + ) -> list: """Compute a embedding from a given audio file. 
Args: diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5229af81c5..89c56583f5 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import fsspec import numpy as np @@ -56,11 +56,11 @@ class SpeakerManager(EmbeddingManager): def __init__( self, - data_items: List[List[Any]] = None, + data_items: Optional[list[list[Any]]] = None, d_vectors_file_path: str = "", - speaker_id_file_path: str = "", - encoder_model_path: str = "", - encoder_config_path: str = "", + speaker_id_file_path: Union[str, os.PathLike[Any]] = "", + encoder_model_path: Union[str, os.PathLike[Any]] = "", + encoder_config_path: Union[str, os.PathLike[Any]] = "", use_cuda: bool = False, ): super().__init__( diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index e8377ede87..c912e285e4 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -85,7 +85,11 @@ def _expand_number(m): if num % 100 == 0: return _inflect.number_to_words(num // 100) + " hundred" return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - return _inflect.number_to_words(num, andword="") + try: + text = _inflect.number_to_words(num, andword="") + except inflect.NumOutOfRangeError: + text = _inflect.number_to_words(num, group=1).replace(", ", " ") + return text def normalize_numbers(text): diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 9c83009b0f..0cba7fc8a8 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,6 +1,7 @@ import logging +import os from io import BytesIO -from typing import Optional +from typing import Any, Optional, Union import librosa import numpy as np @@ -406,7 +407,9 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n return rms_norm(wav=x, db_level=db_level) -def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool = False, **kwargs) -> np.ndarray: +def load_wav( + *, filename: Union[str, os.PathLike[Any]], sample_rate: Optional[int] = None, resample: bool = False, **kwargs +) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -434,7 +437,7 @@ def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool def save_wav( *, wav: np.ndarray, - path: str, + path: Union[str, os.PathLike[Any]], sample_rate: int, pipe_out=None, do_rms_norm: bool = False, diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 1d8fed8e39..bf07333aea 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,5 +1,6 @@ import logging -from typing import Optional +import os +from typing import Any, Optional, Union import librosa import numpy as np @@ -548,7 +549,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: return volume_norm(x=x) ### save and load ### - def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: + def load_wav(self, filename: Union[str, os.PathLike[Any]], sr: Optional[int] = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. 
Therefore it is recommended to resample the file before. @@ -575,7 +576,9 @@ def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: Optional[int] = None, pipe_out=None) -> None: + def save_wav( + self, wav: np.ndarray, path: Union[str, os.PathLike[Any]], sr: Optional[int] = None, pipe_out=None + ) -> None: """Save a waveform to a file using Scipy. Args: diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py deleted file mode 100644 index 511d215c65..0000000000 --- a/TTS/utils/callbacks.py +++ /dev/null @@ -1,105 +0,0 @@ -class TrainerCallback: - @staticmethod - def on_init_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_start"): - trainer.model.module.on_init_start(trainer) - else: - if hasattr(trainer.model, "on_init_start"): - trainer.model.on_init_start(trainer) - - if hasattr(trainer.criterion, "on_init_start"): - trainer.criterion.on_init_start(trainer) - - if hasattr(trainer.optimizer, "on_init_start"): - trainer.optimizer.on_init_start(trainer) - - @staticmethod - def on_init_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_end"): - trainer.model.module.on_init_end(trainer) - else: - if hasattr(trainer.model, "on_init_end"): - trainer.model.on_init_end(trainer) - - if hasattr(trainer.criterion, "on_init_end"): - trainer.criterion.on_init_end(trainer) - - if hasattr(trainer.optimizer, "on_init_end"): - trainer.optimizer.on_init_end(trainer) - - @staticmethod - def on_epoch_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_start"): - trainer.model.module.on_epoch_start(trainer) - else: - if hasattr(trainer.model, "on_epoch_start"): - trainer.model.on_epoch_start(trainer) - - if hasattr(trainer.criterion, "on_epoch_start"): - trainer.criterion.on_epoch_start(trainer) - - if hasattr(trainer.optimizer, "on_epoch_start"): - trainer.optimizer.on_epoch_start(trainer) - - @staticmethod - def on_epoch_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_end"): - trainer.model.module.on_epoch_end(trainer) - else: - if hasattr(trainer.model, "on_epoch_end"): - trainer.model.on_epoch_end(trainer) - - if hasattr(trainer.criterion, "on_epoch_end"): - trainer.criterion.on_epoch_end(trainer) - - if hasattr(trainer.optimizer, "on_epoch_end"): - trainer.optimizer.on_epoch_end(trainer) - - @staticmethod - def on_train_step_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_start"): - trainer.model.module.on_train_step_start(trainer) - else: - if hasattr(trainer.model, "on_train_step_start"): - trainer.model.on_train_step_start(trainer) - - if hasattr(trainer.criterion, "on_train_step_start"): - trainer.criterion.on_train_step_start(trainer) - - if hasattr(trainer.optimizer, "on_train_step_start"): - trainer.optimizer.on_train_step_start(trainer) - - @staticmethod - def on_train_step_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_end"): - trainer.model.module.on_train_step_end(trainer) - else: - if hasattr(trainer.model, "on_train_step_end"): - trainer.model.on_train_step_end(trainer) - - if hasattr(trainer.criterion, "on_train_step_end"): - trainer.criterion.on_train_step_end(trainer) - - if hasattr(trainer.optimizer, 
"on_train_step_end"): - trainer.optimizer.on_train_step_end(trainer) - - @staticmethod - def on_keyboard_interrupt(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_keyboard_interrupt"): - trainer.model.module.on_keyboard_interrupt(trainer) - else: - if hasattr(trainer.model, "on_keyboard_interrupt"): - trainer.model.on_keyboard_interrupt(trainer) - - if hasattr(trainer.criterion, "on_keyboard_interrupt"): - trainer.criterion.on_keyboard_interrupt(trainer) - - if hasattr(trainer.optimizer, "on_keyboard_interrupt"): - trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 087ae7d0e1..77566c3f6a 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -2,9 +2,10 @@ import datetime import importlib import logging +import os import re from pathlib import Path -from typing import Callable, Dict, Optional, TypeVar, Union +from typing import Any, Callable, Dict, Optional, TextIO, TypeVar, Union import torch from packaging.version import Version @@ -30,6 +31,7 @@ def to_camel(text): text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) text = text.replace("Tts", "TTS") text = text.replace("vc", "VC") + text = text.replace("Knn", "KNN") return text @@ -107,25 +109,34 @@ def setup_logger( level: int = logging.INFO, *, formatter: Optional[logging.Formatter] = None, - screen: bool = False, - tofile: bool = False, - log_dir: str = "logs", + stream: Optional[TextIO] = None, + log_dir: Optional[Union[str, os.PathLike[Any]]] = None, log_name: str = "log", ) -> None: + """Set up a logger. + + Args: + logger_name: Name of the logger to set up + level: Logging level + formatter: Formatter for the logger + stream: Add a StreamHandler for the given stream, e.g. 
sys.stderr or sys.stdout + log_dir: Folder to write the log file (no file created if None) + log_name: Prefix of the log file name + """ lg = logging.getLogger(logger_name) if formatter is None: formatter = logging.Formatter( "%(asctime)s.%(msecs)03d - %(levelname)-8s - %(name)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S" ) lg.setLevel(level) - if tofile: + if log_dir is not None: Path(log_dir).mkdir(exist_ok=True, parents=True) log_file = Path(log_dir) / f"{log_name}_{get_timestamp()}.log" fh = logging.FileHandler(log_file, mode="w") fh.setFormatter(formatter) lg.addHandler(fh) - if screen: - sh = logging.StreamHandler() + if stream is not None: + sh = logging.StreamHandler(stream) sh.setFormatter(formatter) lg.addHandler(sh) @@ -133,3 +144,8 @@ def setup_logger( def is_pytorch_at_least_2_4() -> bool: """Check if the installed Pytorch version is 2.4 or higher.""" return Version(torch.__version__) >= Version("2.4") + + +def optional_to_str(x: Optional[Any]) -> str: + """Convert input to string, using empty string if input is None.""" + return "" if x is None else str(x) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 38fcfd60e9..5dff1b84c8 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -6,17 +6,36 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, Tuple +from typing import Any, Optional, TypedDict, Union import fsspec import requests from tqdm import tqdm from trainer.io import get_user_data_dir +from typing_extensions import Required from TTS.config import load_config, read_json_with_comments +from TTS.vc.configs.knnvc_config import KNNVCConfig logger = logging.getLogger(__name__) + +class ModelItem(TypedDict, total=False): + model_name: Required[str] + model_type: Required[str] + description: str + license: str + author: str + contact: str + commit: Optional[str] + model_hash: str + tos_required: bool + default_vocoder: Optional[str] + model_url: Union[str, list[str]] + github_rls_url: Union[str, list[str]] + hf_url: list[str] + + LICENSE_URLS = { "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", @@ -40,19 +59,24 @@ class ModelManager(object): home path. Args: - models_file (str): path to .model.json file. Defaults to None. - output_prefix (str): prefix to `tts` to download models. Defaults to None + models_file (str or Path): path to .model.json file. Defaults to None. + output_prefix (str or Path): prefix to `tts` to download models. Defaults to None progress_bar (bool): print a progress bar when donwloading a file. Defaults to False. 
""" - def __init__(self, models_file=None, output_prefix=None, progress_bar=False): + def __init__( + self, + models_file: Optional[Union[str, os.PathLike[Any]]] = None, + output_prefix: Optional[Union[str, os.PathLike[Any]]] = None, + progress_bar: bool = False, + ) -> None: super().__init__() self.progress_bar = progress_bar if output_prefix is None: self.output_prefix = get_user_data_dir("tts") else: - self.output_prefix = os.path.join(output_prefix, "tts") - self.models_dict = None + self.output_prefix = Path(output_prefix) / "tts" + self.models_dict = {} if models_file is not None: self.read_models_file(models_file) else: @@ -60,7 +84,7 @@ def __init__(self, models_file=None, output_prefix=None, progress_bar=False): path = Path(__file__).parent / "../.models.json" self.read_models_file(path) - def read_models_file(self, file_path): + def read_models_file(self, file_path: Union[str, os.PathLike[Any]]) -> None: """Read .models.json as a dict Args: @@ -68,7 +92,7 @@ def read_models_file(self, file_path): """ self.models_dict = read_json_with_comments(file_path) - def _list_models(self, model_type, model_count=0): + def _list_models(self, model_type: str, model_count: int = 0) -> list[str]: logger.info("") logger.info("Name format: type/language/dataset/model") model_list = [] @@ -83,21 +107,23 @@ def _list_models(self, model_type, model_count=0): model_count += 1 return model_list - def _list_for_model_type(self, model_type): + def _list_for_model_type(self, model_type: str) -> list[str]: models_name_list = [] model_count = 1 models_name_list.extend(self._list_models(model_type, model_count)) return models_name_list - def list_models(self): + def list_models(self) -> list[str]: models_name_list = [] model_count = 1 for model_type in self.models_dict: model_list = self._list_models(model_type, model_count) models_name_list.extend(model_list) + logger.info("") + logger.info("Path to downloaded models: %s", self.output_prefix) return models_name_list - def log_model_details(self, model_type, lang, dataset, model): + def log_model_details(self, model_type: str, lang: str, dataset: str, model: str) -> None: logger.info("Model type: %s", model_type) logger.info("Language supported: %s", lang) logger.info("Dataset used: %s", dataset) @@ -112,7 +138,7 @@ def log_model_details(self, model_type, lang, dataset, model): self.models_dict[model_type][lang][dataset][model]["default_vocoder"], ) - def model_info_by_idx(self, model_query): + def model_info_by_idx(self, model_query: str) -> None: """Print the description of the model from .models.json file using model_query_idx Args: @@ -144,7 +170,7 @@ def model_info_by_idx(self, model_query): model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/") self.log_model_details(model_type, lang, dataset, model) - def model_info_by_full_name(self, model_query_name): + def model_info_by_full_name(self, model_query_name: str) -> None: """Print the description of the model from .models.json file using model_full_name Args: @@ -165,35 +191,35 @@ def model_info_by_full_name(self, model_query_name): return self.log_model_details(model_type, lang, dataset, model) - def list_tts_models(self): + def list_tts_models(self) -> list[str]: """Print all `TTS` models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("tts_models") - def list_vocoder_models(self): + def list_vocoder_models(self) -> list[str]: """Print all the `vocoder` models and return a list of model names Format is 
`language/dataset/model` """ return self._list_for_model_type("vocoder_models") - def list_vc_models(self): + def list_vc_models(self) -> list[str]: """Print all the voice conversion models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("voice_conversion_models") - def list_langs(self): + def list_langs(self) -> None: """Print all the available languages""" logger.info("Name format: type/language") for model_type in self.models_dict: for lang in self.models_dict[model_type]: logger.info(" %s/%s", model_type, lang) - def list_datasets(self): + def list_datasets(self) -> None: """Print all the datasets""" logger.info("Name format: type/language/dataset") for model_type in self.models_dict: @@ -202,7 +228,7 @@ def list_datasets(self): logger.info(" %s/%s/%s", model_type, lang, dataset) @staticmethod - def print_model_license(model_item: Dict): + def print_model_license(model_item: ModelItem) -> None: """Print the license of a model Args: @@ -217,49 +243,49 @@ def print_model_license(model_item: Dict): else: logger.info("Model's license - No license information available") - def _download_github_model(self, model_item: Dict, output_path: str): + def _download_github_model(self, model_item: ModelItem, output_path: Path) -> None: if isinstance(model_item["github_rls_url"], list): self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar) else: self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar) - def _download_hf_model(self, model_item: Dict, output_path: str): + def _download_hf_model(self, model_item: ModelItem, output_path: Path) -> None: if isinstance(model_item["hf_url"], list): self._download_model_files(model_item["hf_url"], output_path, self.progress_bar) else: self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar) - def download_fairseq_model(self, model_name, output_path): + def download_fairseq_model(self, model_name: str, output_path: Path) -> None: URI_PREFIX = "https://dl.fbaipublicfiles.com/mms/tts/" _, lang, _, _ = model_name.split("/") model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz") self._download_tar_file(model_download_uri, output_path, self.progress_bar) @staticmethod - def set_model_url(model_item: Dict): - model_item["model_url"] = None + def set_model_url(model_item: ModelItem) -> ModelItem: + model_item["model_url"] = "" if "github_rls_url" in model_item: model_item["model_url"] = model_item["github_rls_url"] elif "hf_url" in model_item: model_item["model_url"] = model_item["hf_url"] - elif "fairseq" in model_item["model_name"]: + elif "fairseq" in model_item.get("model_name", ""): model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/" - elif "xtts" in model_item["model_name"]: + elif "xtts" in model_item.get("model_name", ""): model_item["model_url"] = "https://huggingface.co/coqui/" return model_item - def _set_model_item(self, model_name): + def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, Optional[str]]: # fetch model info from the dict if "fairseq" in model_name: model_type, lang, dataset, model = model_name.split("/") - model_item = { + model_item: ModelItem = { + "model_name": model_name, "model_type": "tts_models", "license": "CC BY-NC 4.0", "default_vocoder": None, "author": "fairseq", "description": "this model is released by Meta under Fairseq repo. 
Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.", } - model_item["model_name"] = model_name elif "xtts" in model_name and len(model_name.split("/")) != 4: # loading xtts models with only model name (e.g. xtts_v2.0.2) # check model name has the version number with regex @@ -273,6 +299,8 @@ def _set_model_item(self, model_name): dataset = "multi-dataset" model = model_name model_item = { + "model_name": model_name, + "model_type": model_type, "default_vocoder": None, "license": "CPML", "contact": "info@coqui.ai", @@ -297,9 +325,9 @@ def _set_model_item(self, model_name): return model_item, model_full_name, model, md5hash @staticmethod - def ask_tos(model_full_path): + def ask_tos(model_full_path: Path) -> bool: """Ask the user to agree to the terms of service""" - tos_path = os.path.join(model_full_path, "tos_agreed.txt") + tos_path = model_full_path / "tos_agreed.txt" print(" > You must confirm the following:") print(' | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"') print(' | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]') @@ -311,7 +339,7 @@ def ask_tos(model_full_path): return False @staticmethod - def tos_agreed(model_item, model_full_path): + def tos_agreed(model_item: ModelItem, model_full_path: Path) -> bool: """Check if the user has agreed to the terms of service""" if "tos_required" in model_item and model_item["tos_required"]: tos_path = os.path.join(model_full_path, "tos_agreed.txt") @@ -320,12 +348,12 @@ def tos_agreed(model_item, model_full_path): return False return True - def create_dir_and_download_model(self, model_name, model_item, output_path): - os.makedirs(output_path, exist_ok=True) + def create_dir_and_download_model(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: + output_path.mkdir(exist_ok=True, parents=True) # handle TOS if not self.tos_agreed(model_item, output_path): if not self.ask_tos(output_path): - os.rmdir(output_path) + output_path.rmdir() raise Exception(" [!] You must agree to the terms of service to use this model.") logger.info("Downloading model to %s", output_path) try: @@ -340,9 +368,12 @@ def create_dir_and_download_model(self, model_name, model_item, output_path): logger.exception("Failed to download the model file to %s", output_path) rmtree(output_path) raise e + checkpoints = list(Path(output_path).glob("*.pt*")) + if len(checkpoints) == 1: + checkpoints[0].rename(checkpoints[0].parent / "model.pth") self.print_model_license(model_item=model_item) - def check_if_configs_are_equal(self, model_name, model_item, output_path): + def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: with fsspec.open(self._find_files(output_path)[1], "r", encoding="utf-8") as f: config_local = json.load(f) remote_url = None @@ -358,7 +389,7 @@ def check_if_configs_are_equal(self, model_name, model_item, output_path): logger.info("%s is already downloaded however it has been changed. Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) - def download_model(self, model_name): + def download_model(self, model_name: str) -> tuple[Path, Optional[Path], ModelItem]: """Download model files given the full model name. 
Model name is in the format 'type/language/dataset/model' @@ -374,12 +405,12 @@ def download_model(self, model_name): """ model_item, model_full_name, model, md5sum = self._set_model_item(model_name) # set the model specific output path - output_path = os.path.join(self.output_prefix, model_full_name) - if os.path.exists(output_path): + output_path = Path(self.output_prefix) / model_full_name + if output_path.is_dir(): if md5sum is not None: - md5sum_file = os.path.join(output_path, "hash.md5") - if os.path.isfile(md5sum_file): - with open(md5sum_file, mode="r") as f: + md5sum_file = output_path / "hash.md5" + if md5sum_file.is_file(): + with md5sum_file.open() as f: if not f.read() == md5sum: logger.info("%s has been updated, clearing model cache...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) @@ -404,15 +435,20 @@ def download_model(self, model_name): output_model_path = output_path output_config_path = None if ( - model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name + model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name ): # TODO:This is stupid but don't care for now. output_model_path, output_config_path = self._find_files(output_path) + else: + output_config_path = output_model_path / "config.json" + if model == "knnvc" and not output_config_path.exists(): + knnvc_config = KNNVCConfig() + knnvc_config.save_json(output_config_path) # update paths in the config.json self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item @staticmethod - def _find_files(output_path: str) -> Tuple[str, str]: + def _find_files(output_path: Path) -> tuple[Path, Path]: """Find the model and config files in the output path Args: @@ -423,11 +459,11 @@ def _find_files(output_path: str) -> Tuple[str, str]: """ model_file = None config_file = None - for file_name in os.listdir(output_path): - if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth", "checkpoint.pth"]: - model_file = os.path.join(output_path, file_name) - elif file_name == "config.json": - config_file = os.path.join(output_path, file_name) + for f in output_path.iterdir(): + if f.name in ["model_file.pth", "model_file.pth.tar", "model.pth", "checkpoint.pth"]: + model_file = f + elif f.name == "config.json": + config_file = f if model_file is None: raise ValueError(" [!] Model file not found in the output path") if config_file is None: @@ -435,7 +471,7 @@ def _find_files(output_path: str) -> Tuple[str, str]: return model_file, config_file @staticmethod - def _find_speaker_encoder(output_path: str) -> str: + def _find_speaker_encoder(output_path: Path) -> Optional[Path]: """Find the speaker encoder file in the output path Args: @@ -445,24 +481,24 @@ def _find_speaker_encoder(output_path: str) -> str: str: path to the speaker encoder file """ speaker_encoder_file = None - for file_name in os.listdir(output_path): - if file_name in ["model_se.pth", "model_se.pth.tar"]: - speaker_encoder_file = os.path.join(output_path, file_name) + for f in output_path.iterdir(): + if f.name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = f return speaker_encoder_file - def _update_paths(self, output_path: str, config_path: str) -> None: + def _update_paths(self, output_path: Path, config_path: Path) -> None: """Update paths for certain files in config.json after download. Args: output_path (str): local path the model is downloaded to. 
config_path (str): local config.json path. """ - output_stats_path = os.path.join(output_path, "scale_stats.npy") - output_d_vector_file_path = os.path.join(output_path, "speakers.json") - output_d_vector_file_pth_path = os.path.join(output_path, "speakers.pth") - output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") - output_speaker_ids_file_pth_path = os.path.join(output_path, "speaker_ids.pth") - speaker_encoder_config_path = os.path.join(output_path, "config_se.json") + output_stats_path = output_path / "scale_stats.npy" + output_d_vector_file_path = output_path / "speakers.json" + output_d_vector_file_pth_path = output_path / "speakers.pth" + output_speaker_ids_file_path = output_path / "speaker_ids.json" + output_speaker_ids_file_pth_path = output_path / "speaker_ids.pth" + speaker_encoder_config_path = output_path / "config_se.json" speaker_encoder_model_path = self._find_speaker_encoder(output_path) # update the scale_path.npy file path in the model config.json @@ -487,10 +523,10 @@ def _update_paths(self, output_path: str, config_path: str) -> None: self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) @staticmethod - def _update_path(field_name, new_path, config_path): + def _update_path(field_name: str, new_path: Optional[Path], config_path: Path) -> None: """Update the path in the model config.json for the current environment after download""" - if new_path and os.path.exists(new_path): - config = load_config(config_path) + if new_path is not None and new_path.is_file(): + config = load_config(str(config_path)) field_names = field_name.split(".") if len(field_names) > 1: # field name points to a sub-level field @@ -515,7 +551,7 @@ def _update_path(field_name, new_path, config_path): config.save_json(config_path) @staticmethod - def _download_zip_file(file_url, output_folder, progress_bar): + def _download_zip_file(file_url: str, output_folder: Path, progress_bar: bool) -> None: """Download the github releases""" # download the file r = requests.get(file_url, stream=True) @@ -525,7 +561,7 @@ def _download_zip_file(file_url, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) + temp_zip_name = output_folder / file_url.split("/")[-1] with open(temp_zip_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: @@ -533,24 +569,24 @@ def _download_zip_file(file_url, output_folder, progress_bar): file.write(data) with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) - os.remove(temp_zip_name) # delete zip after extract + temp_zip_name.unlink() # delete zip after extract except zipfile.BadZipFile: logger.exception("Bad zip file - %s", file_url) raise zipfile.BadZipFile # pylint: disable=raise-missing-from # move the files to the outer path for file_path in z.namelist(): - src_path = os.path.join(output_folder, file_path) - if os.path.isfile(src_path): - dst_path = os.path.join(output_folder, os.path.basename(file_path)) + src_path = output_folder / file_path + if src_path.is_file(): + dst_path = output_folder / os.path.basename(file_path) if src_path != dst_path: copyfile(src_path, dst_path) # remove redundant (hidden or not) folders for file_path in z.namelist(): - if os.path.isdir(os.path.join(output_folder, file_path)): - rmtree(os.path.join(output_folder, file_path)) + if (output_folder / 
file_path).is_dir(): + rmtree(output_folder / file_path) @staticmethod - def _download_tar_file(file_url, output_folder, progress_bar): + def _download_tar_file(file_url: str, output_folder: Path, progress_bar: bool) -> None: """Download the github releases""" # download the file r = requests.get(file_url, stream=True) @@ -560,7 +596,7 @@ def _download_tar_file(file_url, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) + temp_tar_name = output_folder / file_url.split("/")[-1] with open(temp_tar_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: @@ -569,43 +605,37 @@ def _download_tar_file(file_url, output_folder, progress_bar): with tarfile.open(temp_tar_name) as t: t.extractall(output_folder) tar_names = t.getnames() - os.remove(temp_tar_name) # delete tar after extract + temp_tar_name.unlink() # delete tar after extract except tarfile.ReadError: logger.exception("Bad tar file - %s", file_url) raise tarfile.ReadError # pylint: disable=raise-missing-from # move the files to the outer path - for file_path in os.listdir(os.path.join(output_folder, tar_names[0])): - src_path = os.path.join(output_folder, tar_names[0], file_path) - dst_path = os.path.join(output_folder, os.path.basename(file_path)) + for file_path in (output_folder / tar_names[0]).iterdir(): + src_path = file_path + dst_path = output_folder / file_path.name if src_path != dst_path: copyfile(src_path, dst_path) # remove the extracted folder - rmtree(os.path.join(output_folder, tar_names[0])) + rmtree(output_folder / tar_names[0]) @staticmethod - def _download_model_files(file_urls, output_folder, progress_bar): + def _download_model_files( + file_urls: list[str], output_folder: Union[str, os.PathLike[Any]], progress_bar: bool + ) -> None: """Download the github releases""" + output_folder = Path(output_folder) for file_url in file_urls: # download the file r = requests.get(file_url, stream=True) # extract the file - bease_filename = file_url.split("/")[-1] - temp_zip_name = os.path.join(output_folder, bease_filename) + base_filename = file_url.split("/")[-1] + file_path = output_folder / base_filename total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte - with open(temp_zip_name, "wb") as file: + with open(file_path, "wb") as f: if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) for data in r.iter_content(block_size): if progress_bar: ModelManager.tqdm_progress.update(len(data)) - file.write(data) - - @staticmethod - def _check_dict_key(my_dict, key): - if key in my_dict.keys() and my_dict[key] is not None: - if not isinstance(key, str): - return True - if isinstance(key, str) and len(my_dict[key]) > 0: - return True - return False + f.write(data) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a9b9feffc1..fafeddfd75 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -2,7 +2,7 @@ import os import time from pathlib import Path -from typing import List +from typing import Any, List, Optional, Union import numpy as np import pysbd @@ -16,6 +16,7 @@ from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import save_wav +from TTS.utils.generic_utils import optional_to_str from 
TTS.vc.configs.openvoice_config import OpenVoiceConfig from TTS.vc.models import setup_model as setup_vc_model from TTS.vc.models.openvoice import OpenVoice @@ -29,18 +30,18 @@ class Synthesizer(nn.Module): def __init__( self, *, - tts_checkpoint: str = "", - tts_config_path: str = "", - tts_speakers_file: str = "", - tts_languages_file: str = "", - vocoder_checkpoint: str = "", - vocoder_config: str = "", - encoder_checkpoint: str = "", - encoder_config: str = "", - vc_checkpoint: str = "", - vc_config: str = "", - model_dir: str = "", - voice_dir: str = None, + tts_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, + tts_config_path: Optional[Union[str, os.PathLike[Any]]] = None, + tts_speakers_file: Optional[Union[str, os.PathLike[Any]]] = None, + tts_languages_file: Optional[Union[str, os.PathLike[Any]]] = None, + vocoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, + vocoder_config: Optional[Union[str, os.PathLike[Any]]] = None, + encoder_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, + encoder_config: Optional[Union[str, os.PathLike[Any]]] = None, + vc_checkpoint: Optional[Union[str, os.PathLike[Any]]] = None, + vc_config: Optional[Union[str, os.PathLike[Any]]] = None, + model_dir: Optional[Union[str, os.PathLike[Any]]] = None, + voice_dir: Optional[Union[str, os.PathLike[Any]]] = None, use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. It takes a tts and a vocoder @@ -66,16 +67,17 @@ def __init__( use_cuda (bool, optional): enable/disable cuda. Defaults to False. """ super().__init__() - self.tts_checkpoint = tts_checkpoint - self.tts_config_path = tts_config_path - self.tts_speakers_file = tts_speakers_file - self.tts_languages_file = tts_languages_file - self.vocoder_checkpoint = vocoder_checkpoint - self.vocoder_config = vocoder_config - self.encoder_checkpoint = encoder_checkpoint - self.encoder_config = encoder_config - self.vc_checkpoint = vc_checkpoint - self.vc_config = vc_config + self.tts_checkpoint = optional_to_str(tts_checkpoint) + self.tts_config_path = optional_to_str(tts_config_path) + self.tts_speakers_file = optional_to_str(tts_speakers_file) + self.tts_languages_file = optional_to_str(tts_languages_file) + self.vocoder_checkpoint = optional_to_str(vocoder_checkpoint) + self.vocoder_config = optional_to_str(vocoder_config) + self.encoder_checkpoint = optional_to_str(encoder_checkpoint) + self.encoder_config = optional_to_str(encoder_config) + self.vc_checkpoint = optional_to_str(vc_checkpoint) + self.vc_config = optional_to_str(vc_config) + model_dir = optional_to_str(model_dir) self.use_cuda = use_cuda self.tts_model = None @@ -94,13 +96,13 @@ def __init__( assert torch.cuda.is_available(), "CUDA is not availabe on this machine." 
if tts_checkpoint: - self._load_tts(tts_checkpoint, tts_config_path, use_cuda) + self._load_tts(self.tts_checkpoint, self.tts_config_path, use_cuda) - if vocoder_checkpoint: - self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) + if vc_checkpoint and model_dir == "": + self._load_vc(self.vc_checkpoint, self.vc_config, use_cuda) - if vc_checkpoint and model_dir is None: - self._load_vc(vc_checkpoint, vc_config, use_cuda) + if vocoder_checkpoint: + self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) if model_dir: if "fairseq" in model_dir: @@ -137,7 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N """ # pylint: disable=global-statement self.vc_config = load_config(vc_config_path) - self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + self.output_sample_rate = self.vc_config.audio.get( + "output_sample_rate", self.vc_config.audio.get("sample_rate", None) + ) self.vc_model = setup_vc_model(config=self.vc_config) self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) if use_cuda: @@ -270,9 +274,21 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: wav = np.array(wav) save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) - def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: - output_wav = self.vc_model.voice_conversion(source_wav, target_wav) - return output_wav + def voice_conversion(self, source_wav: str, target_wav: Union[str, list[str]], **kwargs) -> List[int]: + start_time = time.time() + + if not isinstance(target_wav, list): + target_wav = [target_wav] + output = self.vc_model.voice_conversion(source_wav, target_wav, **kwargs) + if self.vocoder_model is not None: + output = self.vocoder_model.inference(output) + + output = output.squeeze() + process_time = time.time() - start_time + audio_time = len(output) / self.output_sample_rate + logger.info("Processing time: %.3f", process_time) + logger.info("Real-time factor: %.3f", process_time / audio_time) + return output def tts( self, diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py new file mode 100644 index 0000000000..7728ea0a9b --- /dev/null +++ b/TTS/vc/configs/knnvc_config.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class KNNVCAudioConfig(BaseAudioConfig): + """Audio configuration. + + Args: + sample_rate (int): + The sampling rate of the input waveform. + """ + + sample_rate: int = field(default=16000) + + +@dataclass +class KNNVCArgs(Coqpit): + """Model arguments. + + Args: + ssl_dim (int): + The dimension of the self-supervised learning embedding. + """ + + ssl_dim: int = field(default=1024) + + +@dataclass +class KNNVCConfig(BaseVCConfig): + """Parameters. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (KNNVCArgs): + Model architecture arguments. Defaults to `KNNVCArgs()`. + + audio (KNNVCAudioConfig): + Audio processing configuration. Defaults to `KNNVCAudioConfig()`. + + wavlm_layer (int): + WavLM layer to use for feature extraction. 
+ + topk (int): + k in the kNN -- the number of nearest neighbors to average over + """ + + model: str = "knnvc" + model_args: KNNVCArgs = field(default_factory=KNNVCArgs) + audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig) + + wavlm_layer: int = 6 + topk: int = 4 diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index b2fe63d29d..3c6b1a32cf 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -6,7 +6,7 @@ @dataclass class BaseVCConfig(BaseTrainingConfig): - """Shared parameters among all the tts models. + """Shared parameters among all the VC models. Args: diff --git a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index a6d5bcf942..62fae59bc1 100644 --- a/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -1,6 +1,6 @@ import logging from time import perf_counter as timer -from typing import List, Union +from typing import List import numpy as np import torch @@ -22,12 +22,8 @@ class SpeakerEncoder(nn.Module): - def __init__(self, weights_fpath, device: Union[str, torch.device] = None): - """ - :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). - If None, defaults to cuda if it is available on your machine, otherwise the model will - run on cpu. Outputs are always returned on the cpu, as numpy arrays. - """ + def __init__(self, weights_fpath): + """FreeVC speaker encoder.""" super().__init__() # Define the network @@ -35,13 +31,6 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() - # Get the target device - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - elif isinstance(device, str): - device = torch.device(device) - self.device = device - # Load the pretrained model'speaker weights # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") # if not weights_fpath.exists(): @@ -52,8 +41,11 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): checkpoint = load_fsspec(weights_fpath, map_location="cpu") self.load_state_dict(checkpoint["model_state"], strict=False) - self.to(device) - logger.info("Loaded the voice encoder model on %s in %.2f seconds.", device.type, timer() - start) + logger.info("Loaded the voice encoder model in %.2f seconds.", timer() - start) + + @property + def device(self): + return next(self.parameters()).device def forward(self, mels: torch.FloatTensor): """ @@ -123,7 +115,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): return wav_slices, mel_slices - def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75) -> torch.Tensor: """ Computes an embedding for a single utterance. The utterance is divided in partial utterances and an embedding is computed for each. The complete utterance embedding is the @@ -143,8 +135,8 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ then the last partial utterance will be considered by zero-padding the audio. Otherwise, it will be discarded. If there aren't enough frames for one partial utterance, this parameter is ignored so that the function always returns at least one slice. 
- :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - is True, the partial utterances as a numpy array of float32 of shape + :return: the embedding as a float tensor of shape (model_embedding_size,). If + is True, the partial utterances as a float tensor of shape (n_partials, model_embedding_size) and the wav partials as a list of slices will also be returned. """ @@ -160,11 +152,11 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ mels = np.array([mel[s] for s in mel_slices]) with torch.no_grad(): mels = torch.from_numpy(mels).to(self.device) - partial_embeds = self(mels).cpu().numpy() + partial_embeds = self(mels) # Compute the utterance embedding from the partial embeddings - raw_embed = np.mean(partial_embeds, axis=0) - embed = raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = partial_embeds.mean(dim=0) + embed = raw_embed / torch.norm(raw_embed, p=2) if return_partials: return embed, partial_embeds, wav_slices @@ -177,7 +169,9 @@ def embed_speaker(self, wavs: List[np.ndarray], **kwargs): :param wavs: list of wavs a numpy arrays of float32. :param kwargs: extra arguments to embed_utterance() - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + :return: the embedding as a float tensor of shape (model_embedding_size,). """ - raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs], axis=0) - return raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = torch.mean( + torch.stack([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs]), dim=0 + ) + return raw_embed / torch.norm(raw_embed, p=2) diff --git a/TTS/vc/layers/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py index 62f7e74aaf..d9c3858f89 100644 --- a/TTS/vc/layers/freevc/wavlm/__init__.py +++ b/TTS/vc/layers/freevc/wavlm/__init__.py @@ -13,7 +13,7 @@ model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" -def get_wavlm(device="cpu"): +def get_wavlm(device="cpu") -> WavLM: """Download the model and return the model object.""" output_path = get_user_data_dir("tts") diff --git a/TTS/vc/layers/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py index 775f3e5979..0247ec53c1 100644 --- a/TTS/vc/layers/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -9,7 +9,7 @@ import logging import math -from typing import List, Optional, Tuple +from typing import Any, List, Optional, Tuple import numpy as np import torch @@ -322,7 +322,7 @@ def extract_features( ret_conv: bool = False, output_layer: Optional[int] = None, ret_layer_results: bool = False, - ): + ) -> tuple[torch.Tensor, dict[str, Any]]: if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index a9807d7006..8151a0445e 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,15 +1,22 @@ import importlib import logging import re -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union + +from TTS.vc.configs.shared_configs import BaseVCConfig +from TTS.vc.models.base_vc import BaseVC logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": +def setup_model(config: BaseVCConfig) -> BaseVC: logger.info("Using model: %s", config.model) # fetch the right model implementation. 
- if "model" in config and config["model"].lower() == "freevc": + if config["model"].lower() == "freevc": MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC - model = MyModel.init_from_config(config, samples) - return model + elif config["model"].lower() == "knnvc": + MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC + else: + msg = f"Model {config.model} does not exist!" + raise ValueError(msg) + return MyModel.init_from_config(config) diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 22ffd0095c..6f7fb192b0 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -37,7 +37,7 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: AudioProcessor, + ap: Optional[AudioProcessor] = None, speaker_manager: Optional[SpeakerManager] = None, language_manager: Optional[LanguageManager] = None, ) -> None: @@ -51,7 +51,7 @@ def __init__( def _set_model_args(self, config: Coqpit) -> None: """Setup model args based on the config type (`ModelConfig` or `ModelArgs`). - `ModelArgs` has all the fields reuqired to initialize the model architecture. + `ModelArgs` has all the fields required to initialize the model architecture. `ModelConfig` has all the fields required for training, inference and containes `ModelArgs`. diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index c654219c39..104ad9ae6c 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Tuple, Union import librosa import numpy as np @@ -233,7 +233,7 @@ def embed_utterance(self, mel, partial_frames=128, partial_hop=64): class FreeVC(BaseVC): """ - Papaer:: + Paper:: https://arxiv.org/abs/2210.15418# Paper Abstract:: @@ -306,15 +306,11 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): self.wavlm = get_wavlm() - @property - def device(self): - return next(self.parameters()).device - def load_pretrained_speaker_encoder(self): """Load pretrained speaker encoder model as mentioned in the paper.""" logger.info("Loading pretrained speaker encoder model ...") self.enc_spk_ex = SpeakerEncoderEx( - "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt", device=self.device + "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt" ) def init_multispeaker(self, config: Coqpit): @@ -389,8 +385,8 @@ def forward( return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - @torch.no_grad() - def inference(self, c, g=None, mel=None, c_lengths=None): + @torch.inference_mode() + def inference(self, c, g=None, c_lengths=None): """ Inference pass of the model @@ -405,9 +401,6 @@ def inference(self, c, g=None, mel=None, c_lengths=None): """ if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) - if not self.use_spk: - g = self.enc_spk.embed_utterance(mel) - g = g.unsqueeze(-1) z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) z = self.flow(z_p, c_mask, g=g, reverse=True) o = self.dec(z * c_mask, g=g) @@ -438,51 +431,52 @@ def load_audio(self, wav): return wav.float() @torch.inference_mode() - def voice_conversion(self, src, tgt): + def voice_conversion(self, src: Union[str, torch.Tensor], tgt: list[Union[str, torch.Tensor]]): """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterances. 
Returns: torch.Tensor: Output tensor. """ - wav_tgt = self.load_audio(tgt).cpu().numpy() - wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) - - if self.config.model_args.use_spk: - g_tgt = self.enc_spk_ex.embed_utterance(wav_tgt) - g_tgt = torch.from_numpy(g_tgt)[None, :, None].to(self.device) - else: - wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) - mel_tgt = mel_spectrogram_torch( - wav_tgt, - self.config.audio.filter_length, - self.config.audio.n_mel_channels, - self.config.audio.input_sample_rate, - self.config.audio.hop_length, - self.config.audio.win_length, - self.config.audio.mel_fmin, - self.config.audio.mel_fmax, - ) # src wav_src = self.load_audio(src) c = self.extract_wavlm_features(wav_src[None, :]) - if self.config.model_args.use_spk: - audio = self.inference(c, g=g_tgt) - else: - audio = self.inference(c, mel=mel_tgt.transpose(1, 2)) - audio = audio[0][0].data.cpu().float().numpy() - return audio + # tgt + g_tgts = [] + for tg in tgt: + wav_tgt = self.load_audio(tg).cpu().numpy() + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + + if self.config.model_args.use_spk: + g_tgts.append(self.enc_spk_ex.embed_utterance(wav_tgt)[None, :, None]) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + self.config.audio.filter_length, + self.config.audio.n_mel_channels, + self.config.audio.input_sample_rate, + self.config.audio.hop_length, + self.config.audio.win_length, + self.config.audio.mel_fmin, + self.config.audio.mel_fmax, + ) + g_tgts.append(self.enc_spk.embed_utterance(mel_tgt.transpose(1, 2)).unsqueeze(-1)) + + g_tgt = torch.stack(g_tgts).mean(dim=0) + audio = self.inference(c, g=g_tgt) + return audio[0][0].data.cpu().float().numpy() def eval_step(): ... @staticmethod - def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: FreeVCConfig) -> "FreeVC": model = FreeVC(config) return model diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py new file mode 100644 index 0000000000..2f504704ef --- /dev/null +++ b/TTS/vc/models/knnvc.py @@ -0,0 +1,182 @@ +import logging +import os +from typing import Any, Optional, Union + +import torch +import torch.nn.functional as F +import torchaudio +from coqpit import Coqpit +from typing_extensions import TypeAlias + +from TTS.vc.configs.knnvc_config import KNNVCConfig +from TTS.vc.layers.freevc.wavlm import get_wavlm +from TTS.vc.models.base_vc import BaseVC + +logger = logging.getLogger(__name__) + +PathOrTensor: TypeAlias = Union[str, os.PathLike[Any], torch.Tensor] + + +class KNNVC(BaseVC): + """ + Paper:: + https://arxiv.org/abs/2305.18975 + + Paper Abstract:: + Any-to-any voice conversion aims to transform source speech + into a target voice with just a few examples of the target speaker as a + reference. Recent methods produce convincing conversions, but at the cost of + increased complexity -- making results difficult to reproduce and build on. + Instead, we keep it simple. We propose k-nearest neighbors voice conversion + (kNN-VC): a straightforward yet effective method for any-to-any conversion. + First, we extract self-supervised representations of the source and reference + speech. To convert to the target speaker, we replace each frame of the source + representation with its nearest neighbor in the reference. Finally, a pretrained + vocoder synthesizes audio from the converted representation. 
Objective and + subjective evaluations show that kNN-VC improves speaker similarity with similar + intelligibility scores to existing methods. + + Samples:: + https://bshall.github.io/knn-vc + + Original code:: + https://github.com/bshall/knn-vc + + Examples: + >>> from TTS.vc.configs.knnvc_config import KNNVCConfig + >>> from TTS.vc.models.knnvc import KNNVC + >>> config = KNNVCConfig() + >>> model = KNNVC(config) + """ + + def __init__(self, config: Coqpit): + super().__init__(config) + self.ssl_dim = self.args.ssl_dim + self.wavlm = get_wavlm() + + @staticmethod + def init_from_config(config: KNNVCConfig) -> "KNNVC": + return KNNVC(config) + + @torch.inference_mode() + def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor: + """Return features for the given waveform with output shape (seq_len, dim). + + Optionally perform VAD trimming on start/end with `vad_trigger_level`. + """ + # load audio + if isinstance(audio, torch.Tensor): + x: torch.Tensor = audio + sr = self.config.audio.sample_rate + if x.dim() == 1: + x = x[None] + else: + x, sr = torchaudio.load(audio, normalize=True) + + if not sr == self.config.audio.sample_rate: + logger.info(f"Resampling {sr} to {self.config.audio.sample_rate} in {audio}") + x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate) + sr = self.config.audio.sample_rate + + # trim silence from front and back + if vad_trigger_level > 1e-3: + transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level) + x_front_trim = transform(x) + waveform_reversed = torch.flip(x_front_trim, (-1,)) + waveform_reversed_front_trim = transform(waveform_reversed) + x = torch.flip(waveform_reversed_front_trim, (-1,)) + + # extract the representation of each layer + wav_input_16khz = x.to(self.device) + features = self.wavlm.extract_features( + wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False + )[0] + return features.squeeze(0) + + def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor: + """Get concatenated wavlm features for the matching set using all waveforms in `wavs`. + + Wavs are specified as either a list of paths or list of loaded waveform tensors of + shape (channels, T), assumed to be of 16kHz sample rate. + """ + feats = [] + for p in wavs: + feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level)) + + feats = torch.concat(feats, dim=0).cpu() + return feats + + @staticmethod + def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor: + """Like torch.cdist, but fixed dim=-1 and for cosine distance.""" + source_norms = torch.norm(source_feats, p=2, dim=-1) + matching_norms = torch.norm(matching_pool, p=2, dim=-1) + dotprod = ( + -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2) + + source_norms[:, None] ** 2 + + matching_norms[None] ** 2 + ) + dotprod /= 2 + + dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None])) + return dists + + @torch.inference_mode() + def match( + self, + query_seq: torch.Tensor, + matching_set: torch.Tensor, + synth_set: Optional[torch.Tensor] = None, + topk: Optional[int] = None, + target_duration: Optional[float] = None, + ) -> torch.Tensor: + """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching + with k=`topk`. + + Args: + `query_seq`: Tensor (N1, dim) of the input/source query features. 
+ `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm. + `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. We use the matching set to assign + each query vector to a vector in the matching set, and then use the corresponding vector from + the synth set during HiFiGAN synthesis. + By default, and for best performance, this should be identical to the matching set. + `topk`: k in the kNN -- the number of nearest neighbors to average over. + `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds. + + Returns: + - converted features (1, N, dim) + """ + if topk is None: + topk = self.config.topk + synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device) + matching_set = matching_set.to(self.device) + query_seq = query_seq.to(self.device) + + if target_duration is not None: + target_samples = int(target_duration * self.config.audio.sample_rate) + scale_factor = (target_samples / self.hop_length) / query_seq.shape[0] # n_targ_feats / n_input_feats + query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T + + dists = self.fast_cosine_dist(query_seq, matching_set) + best = dists.topk(k=topk, largest=False, dim=-1) + out_feats = synth_set[best.indices].mean(dim=1) + return out_feats.unsqueeze(0) + + def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: Union[str, os.PathLike[Any]]) -> None: + """kNN-VC does not use checkpoints.""" + + def forward(self) -> None: ... + def inference(self) -> None: ... + + @torch.inference_mode() + def voice_conversion( + self, + source: PathOrTensor, + target: list[PathOrTensor], + topk: Optional[int] = None, + ) -> torch.Tensor: + if not isinstance(target, list): + target = [target] + source_features = self.get_features(source) + matching_set = self.get_matching_set(target) + return self.match(source_features, matching_set, topk=topk) diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py index 135b0861b9..3cb37e64b5 100644 --- a/TTS/vc/models/openvoice.py +++ b/TTS/vc/models/openvoice.py @@ -174,10 +174,6 @@ def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = N self.ref_enc = ReferenceEncoder(self.spec_channels, self.gin_channels) - @property - def device(self) -> torch.device: - return next(self.parameters()).device - @staticmethod def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": return OpenVoice(config) @@ -226,9 +222,9 @@ def eval_step(self) -> None: ... 
def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tensor]]) -> torch.Tensor: if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: return aux_input["x_lengths"] - return torch.tensor(x.shape[1:2]).to(x.device) + return torch.tensor(x.shape[-1:]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x: torch.Tensor, @@ -284,8 +280,7 @@ def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list return out.to(self.device).float() def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: - audio_ref = self.load_audio(audio) - y = torch.FloatTensor(audio_ref) + y = self.load_audio(audio) y = y.to(self.device) y = y.unsqueeze(0) spec = wav_to_spec( @@ -301,19 +296,25 @@ def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, tor return g, spec @torch.inference_mode() - def voice_conversion(self, src: Union[str, torch.Tensor], tgt: Union[str, torch.Tensor]) -> npt.NDArray[np.float32]: + def voice_conversion( + self, src: Union[str, torch.Tensor], tgt: list[Union[str, torch.Tensor]] + ) -> npt.NDArray[np.float32]: """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterance. Returns: Output numpy array. """ src_se, src_spec = self.extract_se(src) - tgt_se, _ = self.extract_se(tgt) + tgt_ses = [] + for tg in tgt: + tgt_se, _ = self.extract_se(tg) + tgt_ses.append(tgt_se) + tgt_se = torch.stack(tgt_ses).mean(dim=0) aux_input = {"g_src": src_se, "g_tgt": tgt_se} audio = self.inference(src_spec, aux_input) diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py index 9a102f0c89..60dde496b2 100644 --- a/TTS/vocoder/configs/hifigan_config.py +++ b/TTS/vocoder/configs/hifigan_config.py @@ -5,7 +5,7 @@ @dataclass class HifiganConfig(BaseGANVocoderConfig): - """Defines parameters for FullBand MelGAN vocoder. + """Defines parameters for HifiGAN vocoder. Example: diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index b6a1850484..481d234a54 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -5,11 +5,13 @@ from coqpit import Coqpit from TTS.utils.generic_utils import to_camel +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig, BaseVocoderConfig +from TTS.vocoder.models.base_vocoder import BaseVocoder logger = logging.getLogger(__name__) -def setup_model(config: Coqpit): +def setup_model(config: BaseVocoderConfig) -> BaseVocoder: """Load models directly from configuration.""" if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") @@ -26,19 +28,20 @@ def setup_model(config: Coqpit): try: MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: - raise ValueError(f"Model {config.model} not exist!") from e + raise ValueError(f"Model {config.model} does not exist!") from e logger.info("Vocoder model: %s", config.model) return MyModel.init_from_config(config) -def setup_generator(c): +def setup_generator(c: BaseGANVocoderConfig): """TODO: use config object as arguments""" logger.info("Generator model: %s", c.generator_model) MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) # this is to preserve the Wavernn class name (instead of Wavernn) if c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"]) + model = MyModel(out_channels=1, **c.generator_model_params) elif c.generator_model.lower() in "melgan_generator": model = MyModel( in_channels=c.audio["num_mels"], @@ -94,8 +97,8 @@ def setup_generator(c): return model -def setup_discriminator(c): - """TODO: use config objekt as arguments""" +def setup_discriminator(c: BaseGANVocoderConfig): + """TODO: use config object as arguments""" logger.info("Discriminator model: %s", c.discriminator_model) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") @@ -104,7 +107,7 @@ def setup_discriminator(c): MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) if c.discriminator_model in "hifigan_discriminator": model = MyModel() - if c.discriminator_model in "random_window_discriminator": + elif c.discriminator_model in "random_window_discriminator": model = MyModel( cond_channels=c.audio["num_mels"], hop_length=c.audio["hop_length"], @@ -113,7 +116,7 @@ def setup_discriminator(c): cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], window_sizes=c.discriminator_model_params["window_sizes"], ) - if c.discriminator_model in "melgan_multiscale_discriminator": + elif c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -122,7 +125,7 @@ def setup_discriminator(c): max_channels=c.discriminator_model_params["max_channels"], downsample_factors=c.discriminator_model_params["downsample_factors"], ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + elif c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -137,7 +140,7 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + elif c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -149,6 +152,8 @@ def setup_discriminator(c): nonlinear_activation_params={"negative_slope": 0.2}, bias=True, ) - if c.discriminator_model == "univnet_discriminator": + elif c.discriminator_model == "univnet_discriminator": model = MyModel() + else: + raise NotImplementedError(f"Model {c.discriminator_model} not implemented!") return model diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index ee25559af0..292d3323bb 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -24,7 +24,7 @@ def __init__( num_res_blocks=num_res_blocks, ) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 8792950a56..7785d8011c 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -212,7 +212,7 @@ def train_log( logger.eval_figures(steps, figures) 
logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 8273d02037..b2100c55b1 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -179,6 +179,7 @@ def __init__( conv_post_weight_norm=True, conv_post_bias=True, cond_in_each_up_layer=False, + pre_linear=None, ): r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) @@ -198,6 +199,7 @@ def __init__( for each consecutive upsampling layer. upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. + pre_linear (int): If not None, add nn.Linear(pre_linear, in_channels) before the convolutions. """ super().__init__() self.inference_padding = inference_padding @@ -206,6 +208,8 @@ def __init__( self.cond_in_each_up_layer = cond_in_each_up_layer # initial upsampling layers + if pre_linear is not None: + self.lin_pre = nn.Linear(pre_linear, in_channels) self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 # upsampling layers @@ -258,6 +262,9 @@ def forward(self, x, g=None): x: [B, C, T] Tensor: [B, 1, T] """ + if hasattr(self, "lin_pre"): + x = self.lin_pre(x) + x = x.permute(0, 2, 1) o = self.conv_pre(x) if hasattr(self, "cond_layer"): o = o + self.cond_layer(g) @@ -280,7 +287,7 @@ def forward(self, x, g=None): o = torch.tanh(o) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """ Args: diff --git a/TTS/vocoder/models/multiband_melgan_generator.py b/TTS/vocoder/models/multiband_melgan_generator.py index 25d6590659..6eee712db3 100644 --- a/TTS/vocoder/models/multiband_melgan_generator.py +++ b/TTS/vocoder/models/multiband_melgan_generator.py @@ -32,7 +32,7 @@ def pqmf_analysis(self, x): def pqmf_synthesis(self, x): return self.pqmf_layer.synthesis(x) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index e60c8781f0..0659a00cc1 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -127,7 +127,7 @@ def forward(self, c): return x - @torch.no_grad() + @torch.inference_mode() def inference(self, c): c = c.to(self.first_conv.weight.device) c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 5d1f817927..19f5648f4d 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -139,7 +139,7 @@ def receptive_field_size(self): """Return receptive field size.""" return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """Perform inference. 
Args: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c49abd2201..d756f956dd 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -123,7 +123,7 @@ def load_noise_schedule(self, path): beta = np.load(path, allow_pickle=True).item()["beta"] # pylint: disable=unexpected-keyword-arg self.compute_noise_level(beta) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, y_n=None): """ Shapes: @@ -262,7 +262,7 @@ def train_log( # pylint: disable=no-self-use ) -> Tuple[Dict, np.ndarray]: pass - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: return self.train_step(batch, criterion) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 1847679890..4ece55af62 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -307,7 +307,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) - with torch.no_grad(): + with torch.inference_mode(): if isinstance(mels, np.ndarray): mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device)) diff --git a/docs/source/conf.py b/docs/source/conf.py index e7d36c1f43..e878d0e8f9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,6 +52,7 @@ "sphinx_inline_tabs", ] +suppress_warnings = ["autosectionlabel.*"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -67,6 +68,8 @@ "linkify", ] +myst_heading_anchors = 4 + # 'sphinxcontrib.katex', # 'sphinx.ext.autosectionlabel', diff --git a/docs/source/configuration.md b/docs/source/configuration.md index ada61e16db..220c96c363 100644 --- a/docs/source/configuration.md +++ b/docs/source/configuration.md @@ -1,6 +1,6 @@ # Configuration -We use 👩‍✈️[Coqpit] for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. +We use 👩‍✈️[Coqpit](https://github.com/idiap/coqui-ai-coqpit) for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. ```python from dataclasses import asdict, dataclass, field @@ -36,7 +36,7 @@ class SimpleConfig(Coqpit): check_argument("val_c", c, restricted=True) ``` -In TTS, each model must have a configuration class that exposes all the values necessary for its lifetime. +In Coqui, each model must have a configuration class that exposes all the values necessary for its lifetime. It defines model architecture, hyper-parameters, training, and inference settings. For our models, we merge all the fields in a single configuration class for ease. It may not look like a wise practice but enables easier bookkeeping and reproducible experiments. 
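To make the merged single-config approach above concrete, here is a minimal sketch of filling in and round-tripping such a configuration. It is only an illustration: the field names follow the LJSpeech GlowTTS recipe, and the `to_dict()`/`from_dict()` round-trip is assumed to match current Coqpit releases; adjust imports and values to your installed version.

```python
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Dataset settings live in their own nested config...
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    path="data/LJSpeech-1.1/",  # placeholder path
)

# ...while the model config merges architecture, training and inference fields.
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    run_eval=True,
    epochs=100,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    output_path="runs/glow_tts_ljspeech",  # placeholder path
    datasets=[dataset_config],
)

# Serialization is what makes experiments easy to bookkeep and reproduce;
# the round-trip below assumes Coqpit's to_dict()/from_dict() helpers.
restored = GlowTTSConfig()
restored.from_dict(config.to_dict())
assert restored.batch_size == 32
```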
diff --git a/docs/source/formatting_your_dataset.md b/docs/source/datasets/formatting_your_dataset.md similarity index 95% rename from docs/source/formatting_your_dataset.md rename to docs/source/datasets/formatting_your_dataset.md index 23c497d0bf..e92263339e 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/datasets/formatting_your_dataset.md @@ -1,7 +1,9 @@ (formatting_your_dataset)= -# Formatting Your Dataset +# Formatting your dataset -For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription. +For training a TTS model, you need a dataset with speech recordings and +transcriptions. The speech must be divided into audio clips and each clip needs +a transcription. If you have a single audio file and you need to split it into clips, there are different open-source tools for you. We recommend Audacity. It is an open-source and free audio editing software. @@ -49,7 +51,7 @@ The format above is taken from widely-used the [LJSpeech](https://keithito.com/L Your dataset should have good coverage of the target language. It should cover the phonemic variety, exceptional sounds and syllables. This is extremely important for especially non-phonemic languages like English. -For more info about dataset qualities and properties check our [post](https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset). +For more info about dataset qualities and properties check [this page](what_makes_a_good_dataset.md). ## Using Your Dataset in 🐸TTS diff --git a/docs/source/datasets/index.md b/docs/source/datasets/index.md new file mode 100644 index 0000000000..6b040fc416 --- /dev/null +++ b/docs/source/datasets/index.md @@ -0,0 +1,12 @@ +# Datasets + +For training a TTS model, you need a dataset with speech recordings and +transcriptions. See the following pages for more information on: + +```{toctree} +:maxdepth: 1 + +formatting_your_dataset +what_makes_a_good_dataset +tts_datasets +``` diff --git a/docs/source/tts_datasets.md b/docs/source/datasets/tts_datasets.md similarity index 90% rename from docs/source/tts_datasets.md rename to docs/source/datasets/tts_datasets.md index 11da1b7688..df8d2f2ad9 100644 --- a/docs/source/tts_datasets.md +++ b/docs/source/datasets/tts_datasets.md @@ -1,6 +1,6 @@ -# TTS Datasets +# Public TTS datasets -Some of the known public datasets that we successfully applied 🐸TTS: +Some of the known public datasets that were successfully used for 🐸TTS: - [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/) - [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/datasets/what_makes_a_good_dataset.md similarity index 100% rename from docs/source/what_makes_a_good_dataset.md rename to docs/source/datasets/what_makes_a_good_dataset.md diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index 58d961203e..ef98fe302e 100644 --- a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -1,20 +1,20 @@ (docker_images)= -## Docker images +# Docker images We provide docker images to be able to test TTS without having to setup your own environment. -### Using premade images +## Using premade images You can use premade images built automatically from the latest TTS version. 
-#### CPU version +### CPU version ```bash -docker pull ghcr.io/coqui-ai/tts-cpu +docker pull ghcr.io/idiap/coqui-tts-cpu ``` -#### GPU version +### GPU version ```bash -docker pull ghcr.io/coqui-ai/tts +docker pull ghcr.io/idiap/coqui-tts ``` -### Building your own image +## Building your own image ```bash docker build -t tts . ``` @@ -25,14 +25,14 @@ You can pass any tts argument after the image name. ### CPU version ```bash -docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav +docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav ``` ### GPU version For the GPU version, you need to have the latest NVIDIA drivers installed. With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8 ```bash -docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda +docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda ``` ## Start a server @@ -41,14 +41,14 @@ Start the container and get a shell inside it. ### CPU version ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ``` ### GPU version ```bash -docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts +docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/idiap/coqui-tts python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda ``` diff --git a/docs/source/implementing_a_new_language_frontend.md b/docs/source/extension/implementing_a_new_language_frontend.md similarity index 88% rename from docs/source/implementing_a_new_language_frontend.md rename to docs/source/extension/implementing_a_new_language_frontend.md index 2041352d64..0b3ef59be0 100644 --- a/docs/source/implementing_a_new_language_frontend.md +++ b/docs/source/extension/implementing_a_new_language_frontend.md @@ -1,6 +1,6 @@ -# Implementing a New Language Frontend +# Implementing new language front ends -- Language frontends are located under `TTS.tts.utils.text` +- Language front ends are located under `TTS.tts.utils.text` - Each special language has a separate folder. - Each folder contains all the utilities for processing the text input. - `TTS.tts.utils.text.phonemizers` contains the main phonemizer for a language. This is the class that uses the utilities diff --git a/docs/source/implementing_a_new_model.md b/docs/source/extension/implementing_a_new_model.md similarity index 96% rename from docs/source/implementing_a_new_model.md rename to docs/source/extension/implementing_a_new_model.md index 1bf7a8822e..188f466c72 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/extension/implementing_a_new_model.md @@ -1,4 +1,4 @@ -# Implementing a Model +# Implementing new models 1. Implement layers. @@ -36,7 +36,8 @@ There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. 
Callbacks give you an infinite flexibility to add custom behaviours for your model and training routines. - For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. + For more details, see [BaseTTS](../main_classes/model_api.md#base-tts-model) + and [`trainer.callbacks`](https://github.com/idiap/coqui-ai-Trainer/blob/main/trainer/callbacks.py). 6. Optionally, define `MyModelArgs`. @@ -62,7 +63,7 @@ We love you more when you document your code. ❤️ -# Template 🐸TTS Model implementation +## Template 🐸TTS Model implementation You can start implementing your model by copying the following base class. diff --git a/docs/source/extension/index.md b/docs/source/extension/index.md new file mode 100644 index 0000000000..39c36b632c --- /dev/null +++ b/docs/source/extension/index.md @@ -0,0 +1,14 @@ +# Adding models or languages + +You can extend Coqui by implementing new model architectures or adding front +ends for new languages. See the pages below for more details. The [project +structure](../project_structure.md) and [contribution +guidelines](../contributing.md) may also be helpful. Please open a pull request +with your changes to share back the improvements with the community. + +```{toctree} +:maxdepth: 1 + +implementing_a_new_model +implementing_a_new_language_frontend +``` diff --git a/docs/source/faq.md b/docs/source/faq.md index 1090aaa35c..4fbd149f00 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -1,28 +1,56 @@ -# Humble FAQ -We tried to collect common issues and questions we receive about 🐸TTS. It is worth checking before going deeper. +# FAQ +We tried to collect common issues and questions we receive about 🐸TTS. It is +worth checking before going deeper. -## Errors with a pre-trained model. How can I resolve this? -- Make sure you use the right commit version of 🐸TTS. Each pre-trained model has its corresponding version that needs to be used. It is defined on the model table. -- If it is still problematic, post your problem on [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) -- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. +## Using Coqui -## What are the requirements of a good 🐸TTS dataset? -* {ref}`See this page ` +### Where does Coqui store downloaded models? -## How should I choose the right model? +The path to downloaded models is printed when running `tts --list_models`. +Default locations are: + +- **Linux:** `~/.local/share/tts` +- **Mac:** `~/Library/Application Support/tts` +- **Windows:** `C:\Users\\AppData\Local\tts` + +You can change the prefix of this `tts/` folder by setting the `XDG_DATA_HOME` +or `TTS_HOME` environment variables. + +### Errors with a pre-trained model. How can I resolve this? +- Make sure you use the latest version of 🐸TTS. Each pre-trained model is only + supported from a certain minimum version. +- If it is still problematic, post your problem on + [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give + as many details as possible (error message, your TTS version, your TTS model + and config.json etc.) +- If you feel like it's a bug to be fixed, then prefer Github issues with the + same level of scrutiny. + +## Training Coqui models + +### What are the requirements of a good 🐸TTS dataset? +- [See this page](datasets/what_makes_a_good_dataset.md) + +### How should I choose the right model? - First, train Tacotron. 
It is smaller and faster to experiment with. If it performs poorly, try Tacotron2. - Tacotron models produce the most natural voice if your dataset is not too noisy. - If both models do not perform well and especially the attention does not align, then try AlignTTS or GlowTTS. - If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. -## How can I train my own `tts` model? +### How can I train my own `tts` model? + +```{note} XTTS has separate fine-tuning scripts, see [here](models/xtts.md#training). +``` + 0. Check your dataset with notebooks in [dataset_analysis](https://github.com/idiap/coqui-ai-TTS/tree/main/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/idiap/coqui-ai-TTS/blob/main/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. -1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. +1. Write your own dataset `formatter` in `datasets/formatters.py` or [format](datasets/formatting_your_dataset) your dataset as one of the supported datasets, like LJSpeech. A `formatter` parses the metadata file and converts a list of training samples. 2. If you have a dataset with a different alphabet than English, you need to set your own character list in the ```config.json```. - - If you use phonemes for training and your language is supported [here](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. + - If you use phonemes for training and your language is supported by + [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) + or [Gruut](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. - You can use `TTS/bin/find_unique_chars.py` to get characters used in your dataset. 3. Write your own text cleaner in ```utils.text.cleaners```. It is not always necessary, except when you have a different alphabet or language-specific requirements. @@ -61,15 +89,16 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` - MultiGPU training: ```python3 -m trainer.distribute --gpus "0,1" --script TTS/bin/train_tts.py --config_path config.json``` -**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. +**Note:** You can also train your model using pure 🐍 python. Check the +[tutorial](tutorial_for_nervous_beginners.md). -## How can I train in a different language? +### How can I train in a different language? - Check steps 2, 3, 4, 5 above. -## How can I train multi-GPUs? +### How can I train multi-GPUs? - Check step 5 above. -## How can I check model performance? +### How can I check model performance? - You can inspect model training and performance using ```tensorboard```. It will show you loss, attention alignment, model output. Go with the order below to measure the model performance. 1. Check ground truth spectrograms. If they do not look as they are supposed to, then check audio processing parameters in ```config.json```. 2. Check train and eval losses and make sure that they all decrease smoothly in time. 
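The checks above are easiest to follow in the TensorBoard dashboard; a minimal sketch, assuming the Trainer's default TensorBoard logger and a placeholder output path:

```bash
# The spectrograms, losses and attention alignments mentioned above are written
# to the training output folder; the path below is a placeholder.
tensorboard --logdir <path_to_your_training_output>
```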
@@ -84,7 +113,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - 'bidirectional_decoder' is your ultimate savior, but it trains 2x slower and demands 1.5x more GPU memory. - You can also try the other models like AlignTTS or GlowTTS. -## How do I know when to stop training? +### How do I know when to stop training? There is no single objective metric to decide the end of a training since the voice quality is a subjective matter. In our model trainings, we follow these steps; @@ -97,17 +126,17 @@ In our model trainings, we follow these steps; Keep in mind that the approach above only validates the model robustness. It is hard to estimate the voice quality without asking the actual people. The best approach is to pick a set of promising models and run a Mean-Opinion-Score study asking actual people to score the models. -## My model does not learn. How can I debug? +### My model does not learn. How can I debug? - Go over the steps under "How can I check model performance?" -## Attention does not align. How can I make it work? +### Attention does not align. How can I make it work? - Check the 4th step under "How can I check model performance?" -## How can I test a trained model? -- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. +### How can I test a trained model? +- The best way is to use `tts` or `tts-server` commands. For details check [here](inference.md). - If you need to code your own ```TTS.utils.synthesizer.Synthesizer``` class. -## My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. +### My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. - In general, all of the above relates to the `stopnet`. It is the part of the model telling the `decoder` when to stop. - In general, a poor `stopnet` relates to something else that is broken in your model or dataset. Especially the attention module. - One common reason is the silent parts in the audio clips at the beginning and the ending. Check ```trim_db``` value in the config. You can find a better value for your dataset by using ```CheckSpectrogram``` notebook. If this value is too small, too much of the audio will be trimmed. If too big, then too much silence will remain. Both will curtail the `stopnet` performance. diff --git a/docs/source/index.md b/docs/source/index.md index 79993eec76..3a030b4f81 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,62 +1,63 @@ +--- +hide-toc: true +--- ```{include} ../../README.md :relative-images: +:end-before: ``` ----- - -# Documentation Content -```{eval-rst} -.. toctree:: - :maxdepth: 2 - :caption: Get started - - tutorial_for_nervous_beginners - installation - faq - contributing - -.. toctree:: - :maxdepth: 2 - :caption: Using 🐸TTS - - inference - docker_images - implementing_a_new_model - implementing_a_new_language_frontend - training_a_model - finetuning - configuration - formatting_your_dataset - what_makes_a_good_dataset - tts_datasets - marytts - -.. toctree:: - :maxdepth: 2 - :caption: Main Classes - - main_classes/trainer_api - main_classes/audio_processor - main_classes/model_api - main_classes/dataset - main_classes/gan - main_classes/speaker_manager - -.. toctree:: - :maxdepth: 2 - :caption: `tts` Models - - models/glow_tts.md - models/vits.md - models/forward_tts.md - models/tacotron1-2.md - models/overflow.md - models/tortoise.md - models/bark.md - models/xtts.md - -.. 
toctree:: - :maxdepth: 2 - :caption: `vocoder` Models +```{toctree} +:maxdepth: 1 +:caption: Get started +:hidden: + +tutorial_for_nervous_beginners +installation +docker_images +faq +project_structure +contributing +``` + +```{toctree} +:maxdepth: 1 +:caption: Using Coqui +:hidden: + +inference +training/index +extension/index +datasets/index +``` + + +```{toctree} +:maxdepth: 1 +:caption: Main Classes +:hidden: + +configuration +main_classes/trainer_api +main_classes/audio_processor +main_classes/model_api +main_classes/dataset +main_classes/gan +main_classes/speaker_manager +``` + + +```{toctree} +:maxdepth: 1 +:caption: TTS Models +:hidden: + +models/glow_tts.md +models/vits.md +models/forward_tts.md +models/tacotron1-2.md +models/overflow.md +models/tortoise.md +models/bark.md +models/xtts.md ``` diff --git a/docs/source/inference.md b/docs/source/inference.md index 4cb8f45a71..1bb844aee3 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -1,194 +1,22 @@ (synthesizing_speech)= -# Synthesizing Speech +# Synthesizing speech -First, you need to install TTS. We recommend using PyPi. You need to call the command below: +## Overview -```bash -$ pip install coqui-tts -``` - -After the installation, 2 terminal commands are available. - -1. TTS Command Line Interface (CLI). - `tts` -2. Local Demo Server. - `tts-server` -3. In 🐍Python. - `from TTS.api import TTS` - -## On the Commandline - `tts` -![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) - -After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. - -Listing released 🐸TTS models. - -```bash -tts --list_models -``` - -Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts --text "Text for TTS" \ - --model_name "tts_models///" \ - --vocoder_name "vocoder_models///" \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS model (Using Griffin-Lim Vocoder) - -```bash -tts --text "Text for TTS" \ - --model_path path/to/model.pth \ - --config_path path/to/config.json \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS and Vocoder models - -```bash -tts --text "Text for TTS" \ - --config_path path/to/config.json \ - --model_path path/to/model.pth \ - --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth \ - --vocoder_config_path path/to/vocoder_config.json -``` - -Run a multi-speaker TTS model from the released models list. - -```bash -tts --model_name "tts_models///" --list_speaker_idxs # list the possible speaker IDs. -tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "tts_models///" --speaker_idx "" -``` - -Run a released voice conversion model - -```bash -tts --model_name "voice_conversion///" - --source_wav "my/source/speaker/audio.wav" - --target_wav "my/target/speaker/audio.wav" - --out_path folder/to/save/output.wav -``` - -**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. 
- -## On the Demo Server - `tts-server` - - -![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) - -You can boot up a demo 🐸TTS server to run an inference with your models (make -sure to install the additional dependencies with `pip install coqui-tts[server]`). -Note that the server is not optimized for performance but gives you an easy way -to interact with the models. +Coqui TTS provides three main methods for inference: -The demo server provides pretty much the same interface as the CLI command. +1. 🐍Python API +2. TTS command line interface (CLI) +3. [Local demo server](server.md) -```bash -tts-server -h # see the help -tts-server --list_models # list the available models. +```{include} ../../README.md +:start-after: ``` -Run a TTS model, from the release models list, with its default vocoder. -If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize -speech. - -```bash -tts-server --model_name "///" -``` - -Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts-server --model_name "///" \ - --vocoder_name "///" -``` - -## Python 🐸TTS API - -You can run a multi-speaker and multi-lingual model in Python as - -```python -import torch -from TTS.api import TTS - -# Get device -device = "cuda" if torch.cuda.is_available() else "cpu" - -# List available 🐸TTS models -print(TTS().list_models()) - -# Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) - -# Run TTS -# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language -# Text to speech list of amplitude values as output -wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") -# Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -``` - -#### Here is an example for a single speaker model. - -```python -# Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False) -# Run TTS -tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) -``` - -#### Example voice cloning with YourTTS in English, French and Portuguese: - -```python -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda") -tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="output.wav") -tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav") -``` - -#### Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav` - -```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") -tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") -``` - -#### Example voice cloning by a single speaker TTS model combining with the voice conversion model. - -This way, you can clone voices by using any model in 🐸TTS. 
- -```python -tts = TTS("tts_models/de/thorsten/tacotron2-DDC") -tts.tts_with_vc_to_file( - "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", - file_path="ouptut.wav" -) -``` - -#### Example text to speech using **Fairseq models in ~1100 languages** 🤯. -For these models use the following name format: `tts_models//fairseq/vits`. - -You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). - -```python -from TTS.api import TTS -api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda") -api.tts_to_file("This is a test.", file_path="output.wav") -# TTS with on the fly voice conversion -api = TTS("tts_models/deu/fairseq/vits") -api.tts_with_vc_to_file( - "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", - file_path="ouptut.wav" -) +```{toctree} +:hidden: +vc +server +marytts ``` diff --git a/docs/source/installation.md b/docs/source/installation.md index 405c436643..1315395a59 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,40 +1,6 @@ # Installation -🐸TTS supports python >=3.9 <3.13.0 and was tested on Ubuntu 22.04. - -## Using `pip` - -`pip` is recommended if you want to use 🐸TTS only for inference. - -You can install from PyPI as follows: - -```bash -pip install coqui-tts # from PyPI -``` - -Or install from Github: - -```bash -pip install git+https://github.com/idiap/coqui-ai-TTS # from Github +```{include} ../../README.md +:start-after: +:end-before: ``` - -## Installing From Source - -This is recommended for development and more control over 🐸TTS. - -```bash -git clone https://github.com/idiap/coqui-ai-TTS -cd coqui-ai-TTS -make system-deps # only on Linux systems. - -# Install package and optional extras -make install - -# Same as above + dev dependencies and pre-commit -make install_dev -``` - -## On Windows -If you are on Windows, 👑@GuyPaddock wrote installation instructions -[here](https://stackoverflow.com/questions/66726331/) (note that these are out -of date, e.g. you need to have at least Python 3.9) diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md index 71b3d41640..bb7e9d1a1d 100644 --- a/docs/source/main_classes/model_api.md +++ b/docs/source/main_classes/model_api.md @@ -1,22 +1,22 @@ # Model API Model API provides you a set of functions that easily make your model compatible with the `Trainer`, -`Synthesizer` and `ModelZoo`. +`Synthesizer` and the Coqui Python API. -## Base TTS Model +## Base Trainer Model ```{eval-rst} .. autoclass:: TTS.model.BaseTrainerModel :members: ``` -## Base tts Model +## Base TTS Model ```{eval-rst} .. autoclass:: TTS.tts.models.base_tts.BaseTTS :members: ``` -## Base vocoder Model +## Base Vocoder Model ```{eval-rst} .. 
autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index 335294aa4d..bdb6048e45 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,3 +1,3 @@ # Trainer API -We made the trainer a separate project on https://github.com/eginhard/coqui-trainer +We made the trainer a separate project: https://github.com/idiap/coqui-ai-Trainer diff --git a/docs/source/marytts.md b/docs/source/marytts.md index 9091ca330f..11cf4a2b9a 100644 --- a/docs/source/marytts.md +++ b/docs/source/marytts.md @@ -1,4 +1,4 @@ -# Mary-TTS API Support for Coqui-TTS +# Mary-TTS API support for Coqui TTS ## What is Mary-TTS? diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index 7c0f1c4a60..5f6c6ba44c 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -1,25 +1,25 @@ -# ⓍTTS -ⓍTTS is a super cool Text-to-Speech model that lets you clone voices in different languages by using just a quick 3-second audio clip. Built on the 🐢Tortoise, -ⓍTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. +# XTTS +XTTS is a super cool Text-to-Speech model that lets you clone voices in different languages by using just a quick 3-second audio clip. Built on the 🐢Tortoise, +XTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. There is no need for an excessive amount of training data that spans countless hours. -### Features +## Features - Voice cloning. - Cross-language voice cloning. - Multi-lingual speech generation. - 24khz sampling rate. -- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-inference)) +- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-manually)) - Fine-tuning support. (See [Training](#training)) -### Updates with v2 +## Updates with v2 - Improved voice cloning. - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime. - Across the board quality improvements. -### Code +## Code Current implementation only supports inference and GPT encoder training. -### Languages +## Languages XTTS-v2 supports 17 languages: - Arabic (ar) @@ -40,15 +40,15 @@ XTTS-v2 supports 17 languages: - Spanish (es) - Turkish (tr) -### License +## License This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml). -### Contact +## Contact Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Github](https://github.com/idiap/coqui-ai-TTS/discussions). 
-### Inference +## Inference -#### 🐸TTS Command line +### 🐸TTS Command line You can check all supported languages with the following command: @@ -64,7 +64,7 @@ You can check all Coqui available speakers with the following command: --list_speaker_idx ``` -##### Coqui speakers +#### Coqui speakers You can do inference using one of the available speakers using the following command: ```console @@ -75,10 +75,10 @@ You can do inference using one of the available speakers using the following com --use_cuda ``` -##### Clone a voice +#### Clone a voice You can clone a speaker voice using a single or multiple references: -###### Single reference +##### Single reference ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ @@ -88,7 +88,7 @@ You can clone a speaker voice using a single or multiple references: --use_cuda ``` -###### Multiple references +##### Multiple references ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ --text "Bugün okula gitmek istemiyorum." \ @@ -106,12 +106,12 @@ or for all wav files in a directory you can use: --use_cuda ``` -#### 🐸TTS API +### 🐸TTS API -##### Clone a voice +#### Clone a voice You can clone a speaker voice using a single or multiple references: -###### Single reference +##### Single reference Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio. You can optionally disable sentence splitting for better coherence but more VRAM and possibly hitting models context length limit. @@ -129,7 +129,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t ) ``` -###### Multiple references +##### Multiple references You can pass multiple audio files to the `speaker_wav` argument for better voice cloning. @@ -154,7 +154,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t language="en") ``` -##### Coqui speakers +#### Coqui speakers You can do inference using one of the available speakers using the following code: @@ -163,28 +163,29 @@ from TTS.api import TTS tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings -tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - file_path="output.wav", - speaker="Ana Florence", - language="en", - split_sentences=True - ) +tts.tts_to_file( + text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + file_path="output.wav", + speaker="Ana Florence", + language="en", + split_sentences=True +) ``` -#### 🐸TTS Model API +### 🐸TTS Model API To use the model API, you need to download the model files and pass config and model file paths manually. -#### Manual Inference +### Manual Inference If you want to be able to `load_checkpoint` with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first. ```console -pip install deepspeed==0.10.3 +pip install deepspeed ``` -##### inference parameters +#### Inference parameters - `text`: The text to be synthesized. - `language`: The language of the text to be synthesized. @@ -199,7 +200,7 @@ pip install deepspeed==0.10.3 - `enable_text_splitting`: Whether to split the text into sentences and generate audio for each sentence. It allows you to have infinite input length but might loose important context between sentences. Defaults to True. 
-##### Inference +#### Inference ```python @@ -230,8 +231,13 @@ out = model.inference( torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) ``` +You can also use the Coqui speakers: + +```python +gpt_cond_latent, speaker_embedding = model.speaker_manager.speakers["Ana Florence"].values() +``` -##### Streaming manually +#### Streaming manually Here the goal is to stream the audio as it is being generated. This is useful for real-time applications. Streaming inference is typically slower than regular inference, but it allows to get a first chunk of audio faster. @@ -275,9 +281,9 @@ torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000) ``` -### Training +## Training -#### Easy training +### Easy training To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio demo that implements the whole fine-tuning pipeline. The gradio demo enables the user to easily do the following steps: - Preprocessing of the uploaded audio or audio files in 🐸 TTS coqui formatter @@ -286,7 +292,7 @@ To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio The user can run this gradio demo locally or remotely using a Colab Notebook. -##### Run demo on Colab +#### Run demo on Colab To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we did a Google Colab Notebook. The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). @@ -302,7 +308,7 @@ If you are not able to acess the video you need to follow the steps: 5. Soon the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference". -##### Run demo locally +#### Run demo locally To run the demo locally you need to do the following steps: 1. Install 🐸 TTS following the instructions available [here](https://coqui-tts.readthedocs.io/en/latest/installation.html). @@ -319,7 +325,7 @@ If you are not able to access the video, here is what you need to do: 4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. 5. Now you can run inference with the model by clicking on the button "Step 4 - Inference". -#### Advanced training +### Advanced training A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -393,6 +399,6 @@ torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) ## XTTS Model ```{eval-rst} -.. autoclass:: TTS.tts.models.xtts.XTTS +.. 
autoclass:: TTS.tts.models.xtts.Xtts :members: ``` diff --git a/docs/source/project_structure.md b/docs/source/project_structure.md new file mode 100644 index 0000000000..af3e472adc --- /dev/null +++ b/docs/source/project_structure.md @@ -0,0 +1,30 @@ +# Project structure + +## Directory structure + +A non-comprehensive overview of the Coqui source code: + +| Directory | Contents | +| - | - | +| **Core** | | +| **[`TTS/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS)** | Main source code | +| **[`- .models.json`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/.models.json)** | Pretrained model list | +| **[`- api.py`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/api.py)** | Python API | +| **[`- bin/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/bin)** | Executables and CLI | +| **[`- tts/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts)** | Text-to-speech models | +| **[`- configs/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/configs)** | Model configurations | +| **[`- layers/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/layers)** | Model layer definitions | +| **[`- models/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/models)** | Model definitions | +| **[`- vc/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/vc)** | Voice conversion models | +| `- (same)` | | +| **[`- vocoder/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/vocoder)** | Vocoder models | +| `- (same)` | | +| **[`- encoder/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/encoder)** | Speaker encoder models | +| `- (same)` | | +| **Recipes/notebooks** | | +| **[`notebooks/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/notebooks)** | Jupyter Notebooks for model evaluation, parameter selection and data analysis | +| **[`recipes/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes)** | Training recipes | +| **Others** | | +| **[`pyproject.toml`](https://github.com/idiap/coqui-ai-TTS/tree/dev/pyproject.toml)** | Project metadata, configuration and dependencies | +| **[`docs/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/docs)** | Documentation | +| **[`tests/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/tests)** | Unit and integration tests | diff --git a/docs/source/server.md b/docs/source/server.md new file mode 100644 index 0000000000..69bdace27b --- /dev/null +++ b/docs/source/server.md @@ -0,0 +1,30 @@ +# Demo server + +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) + +You can boot up a demo 🐸TTS server to run an inference with your models (make +sure to install the additional dependencies with `pip install coqui-tts[server]`). +Note that the server is not optimized for performance. + +The demo server provides pretty much the same interface as the CLI command. + +```bash +tts-server -h # see the help +tts-server --list_models # list the available models. +``` + +Run a TTS model, from the release models list, with its default vocoder. +If the model you choose is a multi-speaker or multilingual TTS model, you can +select different speakers and languages on the Web interface and synthesize +speech. + +```bash +tts-server --model_name "///" +``` + +Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
+ +```bash +tts-server --model_name "///" \ + --vocoder_name "///" +``` diff --git a/docs/source/finetuning.md b/docs/source/training/finetuning.md similarity index 91% rename from docs/source/finetuning.md rename to docs/source/training/finetuning.md index 548e385ec7..fa2ed34a54 100644 --- a/docs/source/finetuning.md +++ b/docs/source/training/finetuning.md @@ -1,4 +1,4 @@ -# Fine-tuning a 🐸 TTS model +# Fine-tuning a model ## Fine-tuning @@ -21,17 +21,21 @@ them and fine-tune it for your own dataset. This will help you in two main ways: Fine-tuning comes to the rescue in this case. You can take one of our pre-trained models and fine-tune it on your own speech dataset and achieve reasonable results with only a couple of hours of data. - However, note that, fine-tuning does not ensure great results. The model performance still depends on the - {ref}`dataset quality ` and the hyper-parameters you choose for fine-tuning. Therefore, + However, note that, fine-tuning does not ensure great results. The model + performance still depends on the [dataset quality](../datasets/what_makes_a_good_dataset.md) + and the hyper-parameters you choose for fine-tuning. Therefore, it still takes a bit of tinkering. ## Steps to fine-tune a 🐸 TTS model +```{note} XTTS has separate fine-tuning scripts, see [here](../models/xtts.md#training). +``` + 1. Setup your dataset. You need to format your target dataset in a certain way so that 🐸TTS data loader will be able to load it for the - training. Please see {ref}`this page ` for more information about formatting. + training. Please see [this page](../datasets/formatting_your_dataset.md) for more information about formatting. 2. Choose the model you want to fine-tune. @@ -47,7 +51,8 @@ them and fine-tune it for your own dataset. This will help you in two main ways: You should choose the model based on your requirements. Some models are fast and some are better in speech quality. One lazy way to test a model is running the model on the hardware you want to use and see how it works. For - simple testing, you can use the `tts` command on the terminal. For more info see {ref}`here `. + simple testing, you can use the `tts` command on the terminal. For more info + see [here](../inference.md). 3. Download the model. diff --git a/docs/source/training/index.md b/docs/source/training/index.md new file mode 100644 index 0000000000..b09f9cadcb --- /dev/null +++ b/docs/source/training/index.md @@ -0,0 +1,13 @@ +# Training and fine-tuning + +The following pages show you how to train and fine-tune Coqui models: + +```{toctree} +:maxdepth: 1 + +training_a_model +finetuning +``` + +Also see the [XTTS page](../models/xtts.md#training) if you want to fine-tune +that model. diff --git a/docs/source/training_a_model.md b/docs/source/training/training_a_model.md similarity index 92% rename from docs/source/training_a_model.md rename to docs/source/training/training_a_model.md index 989a57042a..22505ccb17 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training/training_a_model.md @@ -1,4 +1,4 @@ -# Training a Model +# Training a model 1. Decide the model you want to use. @@ -11,11 +11,10 @@ 3. Check the recipes. - Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point for - `Nervous Beginners`. + Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point. A recipe for `GlowTTS` using `LJSpeech` dataset looks like below. 
Let's be creative and call this `train_glowtts.py`. - ```{literalinclude} ../../recipes/ljspeech/glow_tts/train_glowtts.py + ```{literalinclude} ../../../recipes/ljspeech/glow_tts/train_glowtts.py ``` You need to change fields of the `BaseDatasetConfig` to match your dataset and then update `GlowTTSConfig` @@ -113,7 +112,7 @@ Note that different models have different metrics, visuals and outputs. - You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions + You should also check the [FAQ page](../faq.md) for common problems and solutions that occur in a training. 7. Use your best model for inference. @@ -132,7 +131,7 @@ In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models. -# Multi-speaker Training +## Multi-speaker Training Training a multi-speaker model is mostly the same as training a single-speaker model. You need to specify a couple of configuration parameters, initiate a `SpeakerManager` instance and pass it to the model. @@ -142,5 +141,5 @@ d-vectors. For using d-vectors, you first need to compute the d-vectors using th The same Glow-TTS model above can be trained on a multi-speaker VCTK dataset with the script below. -```{literalinclude} ../../recipes/vctk/glow_tts/train_glow_tts.py +```{literalinclude} ../../../recipes/vctk/glow_tts/train_glow_tts.py ``` diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index b417c4c45a..5e5eac0e0a 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -1,24 +1,40 @@ -# Tutorial For Nervous Beginners +# Tutorial for nervous beginners -## Installation +First [install](installation.md) Coqui TTS. -User friendly installation. Recommended only for synthesizing voice. +## Synthesizing Speech + +You can run `tts` and synthesize speech directly on the terminal. ```bash -$ pip install coqui-tts +$ tts -h # see the help +$ tts --list_models # list the available models. ``` -Developer friendly installation. +![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) + + +You can call `tts-server` to start a local demo server that you can open on +your favorite web browser and 🗣️ (make sure to install the additional +dependencies with `pip install coqui-tts[server]`). ```bash -$ git clone https://github.com/idiap/coqui-ai-TTS -$ cd coqui-ai-TTS -$ pip install -e . +$ tts-server -h # see the help +$ tts-server --list_models # list the available models. ``` +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) + +See [this page](inference.md) for more details on synthesizing speech with the +CLI, server or Python API. ## Training a `tts` Model -A breakdown of a simple script that trains a GlowTTS model on the LJspeech dataset. See the comments for more details. +```{note} XTTS has separate fine-tuning scripts, see [here](models/xtts.md#training). +``` + +A breakdown of a simple script that trains a GlowTTS model on the LJspeech +dataset. For a more in-depth guide to training and fine-tuning also see [this +page](training/index.md). ### Pure Python Way @@ -99,25 +115,3 @@ We still support running training from CLI like in the old days. The same traini ``` ❗️ Note that you can also use ```train_vocoder.py``` as the ```tts``` models above. - -## Synthesizing Speech - -You can run `tts` and synthesize speech directly on the terminal. 
- -```bash -$ tts -h # see the help -$ tts --list_models # list the available models. -``` - -![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) - - -You can call `tts-server` to start a local demo server that you can open on -your favorite web browser and 🗣️ (make sure to install the additional -dependencies with `pip install coqui-tts[server]`). - -```bash -$ tts-server -h # see the help -$ tts-server --list_models # list the available models. -``` -![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) diff --git a/docs/source/vc.md b/docs/source/vc.md new file mode 100644 index 0000000000..8b45d9393a --- /dev/null +++ b/docs/source/vc.md @@ -0,0 +1,84 @@ +# Voice conversion + +## Overview + +Voice conversion (VC) converts the voice in a speech signal from one speaker to +that of another speaker while preserving the linguistic content. Coqui supports +both voice conversion on its own, as well as applying it after speech synthesis +to enable multi-speaker output with single-speaker TTS models. + +### Python API + +Converting the voice in `source_wav` to the voice of `target_wav` (the latter +can also be a list of files): + +```python +from TTS.api import TTS + +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) +``` + +Voice cloning by combining TTS and VC. The FreeVC model is used for voice +conversion after synthesizing speech. + +```python + +tts = TTS("tts_models/de/thorsten/tacotron2-DDC") +tts.tts_with_vc_to_file( + "Wie sage ich auf Italienisch, dass ich dich liebe?", + speaker_wav=["target1.wav", "target2.wav"], + file_path="output.wav" +) +``` + +Some models, including [XTTS](models/xtts.md), support voice cloning directly +and a separate voice conversion step is not necessary: + +```python +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) +``` + +### CLI + +```sh +tts --out_path output/path/speech.wav \ + --model_name "//" \ + --source_wav \ + --target_wav +``` + +## Pretrained models + +Coqui includes the following pretrained voice conversion models. Training is not +supported. + +### FreeVC + +- `voice_conversion_models/multilingual/vctk/freevc24` + +Adapted from: https://github.com/OlaWod/FreeVC + +### kNN-VC + +- `voice_conversion_models/multilingual/multi-dataset/knnvc` + +At least 1-5 minutes of target speaker data are recommended. 
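+
+A minimal usage sketch, assuming the same Python API shown above (the file
+paths are placeholders; passing several target clips is one way to cover the
+recommended amount of target speaker audio):
+
+```python
+from TTS.api import TTS
+
+tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc").to("cuda")
+tts.voice_conversion_to_file(
+    source_wav="my/source.wav",
+    # target_wav also accepts a list of files, as noted above
+    target_wav=["my/target1.wav", "my/target2.wav", "my/target3.wav"],
+    file_path="output.wav",
+)
+```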
+ +Adapted from: https://github.com/bshall/knn-vc + +### OpenVoice + +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +Adapted from: https://github.com/myshell-ai/OpenVoice diff --git a/hubconf.py b/hubconf.py index 6e10928265..b49c9d6bcc 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,14 @@ -dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"] +dependencies = [ + "torch", + "gdown", + "pysbd", + "gruut", + "anyascii", + "pypinyin", + "coqpit-config", + "mecab-python3", + "unidic-lite", +] import torch from TTS.utils.manage import ModelManager @@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us if __name__ == "__main__": - synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github") + synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github") synthesizer.tts("This is a test!") diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/plot_embeddings_umap.ipynb similarity index 56% rename from notebooks/PlotUmapLibriTTS.ipynb rename to notebooks/plot_embeddings_umap.ipynb index 1e29790b9e..b661f85673 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/plot_embeddings_umap.ipynb @@ -4,13 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Overview\n", + "# Overview\n", "\n", "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", "\n", "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -19,63 +26,47 @@ "source": [ "import os\n", "import glob\n", + "import random\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", + "import torch\n", "import umap\n", "\n", - "from TTS.utils.audio import AudioProcessor\n", + "from TTS.bin.compute_embeddings import compute_embeddings\n", "from TTS.config import load_config\n", + "from TTS.config.shared_configs import BaseDatasetConfig\n", + "from TTS.tts.datasets import load_tts_samples\n", + "from TTS.utils.audio import AudioProcessor\n", "\n", "from bokeh.io import output_notebook, show\n", "from bokeh.plotting import figure\n", "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", "from bokeh.transform import factor_cmap\n", - "from bokeh.palettes import Category10" + "from bokeh.palettes import Category10\n", + "\n", + "output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", + "For larger sets of speakers, you can use `Category20`, but you need to change it in the `pal` variable too\n", "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", + "List of Bokeh palettes here: https://docs.bokeh.org/en/latest/docs/reference/palettes.html\n", "\n", "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", + "## Config\n", "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" + "You should adjust all the paths to point at the relevant locations for you locally." 
] }, { @@ -84,7 +75,16 @@ "metadata": {}, "outputs": [], "source": [ - "!ls -1 $MODEL_RUN_PATH" + "# Dataset\n", + "formatter_name = \"ljspeech\"\n", + "dataset_name = \"ljspeech\"\n", + "dataset_path = \"path/to/LJSpeech-1.1\"\n", + "meta_file_train = \"metadata.csv\"\n", + "\n", + "# Speaker encoder\n", + "se_model_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\"\n", + "se_config_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\"\n", + "embedding_path = \"speakers.pth\"" ] }, { @@ -93,15 +93,25 @@ "metadata": {}, "outputs": [], "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" + "dataset_config = BaseDatasetConfig()\n", + "dataset_config.formatter = formatter_name\n", + "dataset_config.dataset_name = dataset_name\n", + "dataset_config.path = dataset_path\n", + "dataset_config.meta_file_train = meta_file_train\n", + "\n", + "meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=False)\n", + "utt_to_wav = {\n", + " item[\"audio_unique_name\"]: str(Path(item[\"audio_file\"]).relative_to(dataset_path)) for item in meta_data_train\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" + "## Compute embeddings\n", + "\n", + "You can skip this if you have already computed embeddings with `TTS/bin/compute_embeddings.py`" ] }, { @@ -110,33 +120,38 @@ "metadata": {}, "outputs": [], "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" + "compute_embeddings(\n", + " model_path=se_model_path,\n", + " config_path=se_config_path,\n", + " output_path=embedding_path,\n", + " formatter_name=formatter_name,\n", + " dataset_name=dataset_name,\n", + " dataset_path=dataset_path,\n", + " meta_file_train=meta_file_train,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Check that we did indeed find an embedding" + "## Plot Umap" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "embed_files[0]" + "Bring in the embeddings created by `TTS/bin/compute_embeddings.py`" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" + "embeddings = torch.load(embedding_path, weights_only=True)" ] }, { @@ -145,15 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" + "speakers = set()\n", + "speaker_to_utter = defaultdict(list)\n", + "for idx, embedding in embeddings.items():\n", + " speaker = embedding[\"name\"]\n", + " speakers.add(speaker)\n", + " speaker_to_utter[speaker].append(idx)\n", + "print(f\"Speaker count: {len(speakers)}\")" ] }, 
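+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check before sampling: this assumes the `embeddings` dictionary loaded above, where each value holds a `name` and an `embedding` list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inspect the embedding dimensionality and utterance counts per speaker\n",
+    "example = next(iter(embeddings.values()))\n",
+    "print(f\"Embedding dim: {len(example['embedding'])}\")\n",
+    "print({spk: len(utts) for spk, utts in list(speaker_to_utter.items())[:5]})"
+   ]
+  },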
{ @@ -175,35 +188,32 @@ "labels = []\n", "locations = []\n", "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", + "# single speaker\n", + "num_speakers = 1\n", + "num_utters = 1000\n", "\n", "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", + "# num_speakers = 10\n", + "# num_utters = 20\n", "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", + "speaker_idxs = random.sample(list(speakers), num_speakers)\n", "\n", "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", + " speakers_utter = speaker_to_utter[speaker_idx]\n", + " utter_idxs = random.sample(speakers_utter, num_utters)\n", " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" + " embed = np.array(embeddings[utter_idx][\"embedding\"])\n", + " embeds.append(embed)\n", + " labels.append(speaker_idx)\n", + " locations.append(utt_to_wav[utter_idx])\n", + "embeds = np.stack(embeds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Load embeddings with UMAP" + "### Load embeddings with UMAP" ] }, { @@ -222,9 +232,7 @@ "source": [ "### Interactively charting the data in Bokeh\n", "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", + "You can use the regular Bokeh [tools](https://docs.bokeh.org/en/latest/docs/user_guide/interaction/tools.html) to explore the data, with reset setting it back to normal\n", "\n", "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", "\n", @@ -238,22 +246,17 @@ "outputs": [], "source": [ "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", + " data=dict(\n", + " x=projection.T[0].tolist(),\n", + " y=projection.T[1].tolist(),\n", + " desc=locations,\n", + " label=labels,\n", " )\n", + ")\n", "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", + "hover = HoverTool(tooltips=[(\"file\", \"@desc\"), (\"speaker\", \"@label\")])\n", "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", + "### Optionally consider adding these to the tooltips if you want additional detail\n", "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", "\n", @@ -261,10 +264,13 @@ "pal_size = max(len(factors), 3)\n", "pal = Category10[pal_size]\n", "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", + "p = figure(width=600, 
height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n", + "p.scatter(\n", + " \"x\",\n", + " \"y\",\n", + " source=source_wav_stems,\n", + " color=factor_cmap(\"label\", palette=pal, factors=factors),\n", + ")\n", "\n", "url = \"http://localhost:8000/@desc\"\n", "taptool = p.select(type=TapTool)\n", @@ -292,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd $AUDIO_PATH\n", + "%cd $dataset_path\n", "%pwd\n", "!python -m http.server" ] @@ -300,7 +306,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -314,7 +320,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index bf0a1d88c2..44c5fb7127 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ build-backend = "hatchling.build" [project] name = "coqui-tts" -version = "0.25.1" +version = "0.25.2" description = "Deep learning for Text to Speech." readme = "README.md" requires-python = ">=3.9, <3.13" @@ -70,6 +70,7 @@ dependencies = [ "pyyaml>=6.0", "fsspec[http]>=2023.6.0", "packaging>=23.1", + "typing_extensions>=4.10", # Inference "pysbd>=0.3.4", # Training @@ -86,14 +87,14 @@ dependencies = [ # Bark "encodec>=0.1.1", # XTTS - "num2words>=0.5.11", + "num2words>=0.5.14", "spacy[ja]>=3,<3.8", ] [project.optional-dependencies] # Only used in notebooks notebooks = [ - "bokeh==1.4.0", + "bokeh>=3.0.3", "pandas>=1.4,<2.0", "umap-learn>=0.5.1", ] @@ -137,18 +138,18 @@ all = [ dev = [ "black==24.2.0", "coverage[toml]>=7", - "nose2>=0.15", "pre-commit>=3", + "pytest>=8", "ruff==0.7.0", ] # Dependencies for building the documentation docs = [ - "furo>=2023.5.20", - "myst-parser==2.0.0", - "sphinx==7.2.5", + "furo>=2024.8.6", + "myst-parser==3.0.1", + "sphinx==7.4.7", "sphinx_inline_tabs>=2023.4.21", - "sphinx_copybutton>=0.1", - "linkify-it-py>=2.0.0", + "sphinx_copybutton>=0.5.2", + "linkify-it-py>=2.0.3", ] [project.urls] @@ -173,7 +174,6 @@ exclude = [ "/.readthedocs.yml", "/Makefile", "/dockerfiles", - "/run_bash_tests.sh", "/scripts", "/tests", ] @@ -235,10 +235,10 @@ max-returns = 7 line-length = 120 target-version = ['py39'] +[tool.coverage.report] +skip_covered = true +skip_empty = true + [tool.coverage.run] parallel = true source = ["TTS"] - -[tool.cibuildwheel] -build = "cp*" -skip = "*-win32 *i686 *musllinux*" diff --git a/run_bash_tests.sh b/run_bash_tests.sh deleted file mode 100755 index 2f5ba88934..0000000000 --- a/run_bash_tests.sh +++ /dev/null @@ -1,7 +0,0 @@ -set -e -TF_CPP_MIN_LOG_LEVEL=3 - -# runtime bash based tests -# TODO: move these to python -./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_compute_statistics.sh diff --git a/scripts/sync_readme.py b/scripts/sync_readme.py index 584286814b..97256bca6d 100644 --- a/scripts/sync_readme.py +++ b/scripts/sync_readme.py @@ -22,8 +22,12 @@ def sync_readme(): new_content = replace_between_markers(orig_content, "tts-readme", description.strip()) if args.check: if orig_content != new_content: - print("README.md is out of sync; please edit TTS/bin/TTS_README.md and run scripts/sync_readme.py") + print( + "README.md is out of sync; please reconcile README.md and TTS/bin/synthesize.py and run scripts/sync_readme.py" + ) exit(42) + print("All good, files in sync") + exit(0) readme_path.write_text(new_content) print("Updated README.md") diff --git a/tests/__init__.py b/tests/__init__.py 
index f0a8b2f118..8108bdeb50 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,7 @@ import os +from typing import Callable, Optional +import pytest from trainer.generic_utils import get_cuda from TTS.config import BaseDatasetConfig @@ -44,6 +46,12 @@ def run_cli(command): assert exit_status == 0, f" [!] command `{command}` failed." +def run_main(main_func: Callable, args: Optional[list[str]] = None, expected_code: int = 0): + with pytest.raises(SystemExit) as exc_info: + main_func(args) + assert exc_info.value.code == expected_code + + def get_test_data_config(): return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 5b1fa9d38a..6caf6db30d 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -1,190 +1,194 @@ import os -import unittest -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +import pytest + +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio.processor import AudioProcessor -TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) -# pylint: disable=protected-access -class TestAudio(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf) - - def test_audio_synthesis(self): - """1. load wav - 2. set normalization parameters - 3. extract mel-spec - 4. invert to wav and save the output - """ - print(" > Sanity check for the process wav -> mel -> wav") - - def _test(max_norm, signal_norm, symmetric_norm, clip_norm): - self.ap.max_norm = max_norm - self.ap.signal_norm = signal_norm - self.ap.symmetric_norm = symmetric_norm - self.ap.clip_norm = clip_norm - wav = self.ap.load_wav(WAV_FILE) - mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_melspectrogram(mel) - file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format( - max_norm, signal_norm, symmetric_norm, clip_norm - ) - print(" | > Creating wav file at : ", file_name) - self.ap.save_wav(wav_, OUT_PATH + file_name) - - # maxnorm = 1.0 - _test(1.0, False, False, False) - _test(1.0, True, False, False) - _test(1.0, True, True, False) - _test(1.0, True, False, True) - _test(1.0, True, True, True) - # maxnorm = 4.0 - _test(4.0, False, False, False) - _test(4.0, True, False, False) - _test(4.0, True, True, False) - _test(4.0, True, False, True) - _test(4.0, True, True, True) - - def test_normalize(self): - """Check normalization and denormalization for range values and consistency""" - print(" > Testing normalization and denormalization.") - wav = self.ap.load_wav(WAV_FILE) - wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. 
- self.ap.signal_norm = False - x = self.ap.melspectrogram(wav) - x_old = x - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= 0 - 1, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() < 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - def test_scaler(self): - scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") - conf.stats_path = scaler_stats_path - conf.preemphasis = 0.0 - conf.do_trim_silence = True - conf.signal_norm = True - - ap = AudioProcessor(**conf) - mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) - ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - - self.ap.signal_norm = False - self.ap.preemphasis = 0.0 - - # test scaler forward and backward transforms - wav = self.ap.load_wav(WAV_FILE) - mel_reference = self.ap.melspectrogram(wav) - mel_norm = ap.melspectrogram(wav) - mel_denorm = ap.denormalize(mel_norm) - assert abs(mel_reference - mel_denorm).max() < 1e-4 - - def test_compute_f0(self): # pylint: disable=no-self-use - ap = AudioProcessor(**conf) - wav = ap.load_wav(WAV_FILE) - pitch = ap.compute_f0(wav) - mel = ap.melspectrogram(wav) - assert pitch.shape[0] == mel.shape[1] +@pytest.fixture +def ap(): + """Set up audio processor.""" + return AudioProcessor(**conf) + + +norms = [ + # maxnorm = 1.0 + (1.0, False, False, False), + (1.0, True, False, False), + (1.0, True, True, False), + (1.0, True, False, True), + (1.0, True, True, True), + # maxnorm = 4.0 + (4.0, False, False, False), + (4.0, True, False, False), + (4.0, True, True, False), + (4.0, True, False, True), + (4.0, True, True, True), +] + + +@pytest.mark.parametrize("norms", norms) +def test_audio_synthesis(tmp_path, ap, norms): + """1. load wav + 2. set normalization parameters + 3. extract mel-spec + 4. 
invert to wav and save the output + """ + print(" > Sanity check for the process wav -> mel -> wav") + max_norm, signal_norm, symmetric_norm, clip_norm = norms + ap.max_norm = max_norm + ap.signal_norm = signal_norm + ap.symmetric_norm = symmetric_norm + ap.clip_norm = clip_norm + wav = ap.load_wav(WAV_FILE) + mel = ap.melspectrogram(wav) + wav_ = ap.inv_melspectrogram(mel) + file_name = ( + f"audio_test-melspec_max_norm_{max_norm}-signal_norm_{signal_norm}-" + f"symmetric_{symmetric_norm}-clip_norm_{clip_norm}.wav" + ) + print(" | > Creating wav file at : ", file_name) + ap.save_wav(wav_, tmp_path / file_name) + + +def test_normalize(ap): + """Check normalization and denormalization for range values and consistency""" + print(" > Testing normalization and denormalization.") + wav = ap.load_wav(WAV_FILE) + wav = ap.sound_norm(wav) # normalize audio to get abetter normalization range below. + ap.signal_norm = False + x = ap.melspectrogram(wav) + x_old = x + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= 0 - 1, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= -ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. 
+ x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + ap.signal_norm = True + ap.symmetric_norm = True + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() < 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + +def test_scaler(ap): + scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") + conf.stats_path = scaler_stats_path + conf.preemphasis = 0.0 + conf.do_trim_silence = True + conf.signal_norm = True + + ap = AudioProcessor(**conf) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + ap.signal_norm = False + ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = ap.load_wav(WAV_FILE) + mel_reference = ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap.denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 + + +def test_compute_f0(ap): + wav = ap.load_wav(WAV_FILE) + pitch = ap.compute_f0(wav) + mel = ap.melspectrogram(wav) + assert pitch.shape[0] == mel.shape[1] diff --git a/tests/aux_tests/test_compute_statistics.py b/tests/aux_tests/test_compute_statistics.py new file mode 100644 index 0000000000..d6809eb480 --- /dev/null +++ b/tests/aux_tests/test_compute_statistics.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from tests import get_tests_input_path, run_main +from TTS.bin.compute_statistics import main + + +def test_compute_statistics(tmp_path): + config_path = Path(get_tests_input_path()) / "test_glow_tts_config.json" + output_path = tmp_path / "scale_stats.npy" + run_main(main, ["--config_path", str(config_path), "--out_path", str(output_path)]) diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index f2d119ac35..563c5dae02 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -1,67 +1,23 @@ -import os -import unittest +from pathlib import Path +import pytest import torch -from tests import get_tests_input_path, get_tests_output_path, run_cli +from tests import get_tests_input_path, run_main +from TTS.bin.extract_tts_spectrograms import main from TTS.config import load_config from TTS.tts.models import setup_model torch.manual_seed(1) -# pylint: disable=protected-access -class TestExtractTTSSpectrograms(unittest.TestCase): - @staticmethod - def test_GlowTTS(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth") - output_path = os.path.join(get_tests_output_path(), 
"output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') +@pytest.mark.parametrize("model", ["glow_tts", "tacotron", "tacotron2"]) +def test_extract_tts_spectrograms(tmp_path, model): + config_path = str(Path(get_tests_input_path()) / f"test_{model}_config.json") + checkpoint_path = str(tmp_path / f"{model}.pth") + output_path = str(tmp_path / "output_extract_tts_spectrograms") - @staticmethod - def test_Tacotron2(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') - - @staticmethod - def test_Tacotron(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') + config = load_config(config_path) + model = setup_model(config) + torch.save({"model": model.state_dict()}, checkpoint_path) + run_main(main, ["--config_path", config_path, "--checkpoint_path", checkpoint_path, "--output_path", output_path]) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 018679f573..53298cdebd 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -1,16 +1,12 @@ -import os -import unittest - import torch -from tests import get_tests_output_path, run_cli +from tests import run_main +from TTS.bin.find_unique_phonemes import main from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") - dataset_config_en = BaseDatasetConfig( formatter="ljspeech", meta_file_train="metadata.csv", @@ -30,52 +26,26 @@ """ -# pylint: disable=protected-access -class TestFindUniquePhonemes(unittest.TestCase): - @staticmethod - def test_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - 
phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') - - @staticmethod - def test_no_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') +def test_find_phonemes(tmp_path): + # prepare the config + config_path = str(tmp_path / "test_model_config.json") + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en], + ) + config.save_json(config_path) + + # run test + run_main(main, ["--config_path", config_path]) diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 00597a0f88..129ba5d86b 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -7,18 +7,12 @@ import numpy as np from coqpit import Coqpit -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path, get_tests_path from TTS.utils.audio import numpy_transforms as np_transforms TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) - - -# pylint: disable=no-self-use - class TestNumpyTransforms(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/aux_tests/test_server.py b/tests/aux_tests/test_server.py new file mode 100644 index 0000000000..1b691f9596 --- /dev/null +++ b/tests/aux_tests/test_server.py @@ -0,0 +1,47 @@ +import os +import signal +import socket +import subprocess +import time +import wave + +import pytest +import requests + +PORT = 5003 + + +def wait_for_server(host, port, timeout=30): + start_time = time.time() + while time.time() - start_time < timeout: + try: + with socket.create_connection((host, port), timeout=2): + return True + except (OSError, ConnectionRefusedError): + time.sleep(1) + raise TimeoutError(f"Server at {host}:{port} did not start within {timeout} seconds.") + + +@pytest.fixture(scope="module", autouse=True) +def start_flask_server(): + server_process = subprocess.Popen( + ["python", "-m", "TTS.server.server", "--port", str(PORT)], + ) + wait_for_server("localhost", PORT) + yield + os.kill(server_process.pid, signal.SIGTERM) + server_process.wait() + + +def test_flask_server(tmp_path): + url = f"http://localhost:{PORT}/api/tts?text=synthesis%20schmynthesis" + response = requests.get(url) + assert response.status_code == 200, f"Request failed with status code 
{response.status_code}" + + wav_path = tmp_path / "output.wav" + with wav_path.open("wb") as f: + f.write(response.content) + + with wave.open(str(wav_path), "rb") as wav_file: + num_frames = wav_file.getnframes() + assert num_frames > 0, "WAV file contains no frames." diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 5d8626faa6..0e15db2ab0 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -1,88 +1,86 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig -def run_test_train(): - command = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " +def test_train(tmp_path): + config_path = tmp_path / "test_speaker_encoder_config.json" + output_path = tmp_path / "train_outputs" + + def run_test_train(): + command = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + ) + run_cli(command) + + config = SpeakerEncoderConfig( + batch_size=4, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, + print_step=1, + save_step=2, + print_eval=True, + run_eval=True, + audio=BaseAudioConfig(num_mels=80), ) - run_cli(command) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.loss = "ge2e" + config.save_json(config_path) + print(config) + # train the model for one epoch + run_test_train() -config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -config = SpeakerEncoderConfig( - batch_size=4, - num_classes_in_batch=4, - num_utter_per_class=2, - eval_num_classes_in_batch=4, - eval_num_utter_per_class=2, - num_loader_workers=1, - epochs=1, - print_step=1, - save_step=2, - print_eval=True, - run_eval=True, - audio=BaseAudioConfig(num_mels=80), -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.loss = "ge2e" -config.save_json(config_path) - -print(config) -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test resnet speaker encoder -config.model_params["model_name"] = "resnet" -config.save_json(config_path) - -# train the model for one epoch -run_test_train() - -# Find latest 
folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + # test resnet speaker encoder + config.model_params["model_name"] = "resnet" + config.save_json(config_path) -# test model with ge2e loss function -# config.loss = "ge2e" -# config.save_json(config_path) -# run_test_train() + # train the model for one epoch + run_test_train() -# test model with angleproto loss function -# config.loss = "angleproto" -# config.save_json(config_path) -# run_test_train() + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# test model with softmaxproto loss function -config.loss = "softmaxproto" -config.save_json(config_path) -run_test_train() + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) + + # test model with ge2e loss function + # config.loss = "ge2e" + # config.save_json(config_path) + # run_test_train() + + # test model with angleproto loss function + # config.loss = "angleproto" + # config.save_json(config_path) + # run_test_train() + + # test model with softmaxproto loss function + config.loss = "softmaxproto" + config.save_json(config_path) + run_test_train() diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh deleted file mode 100755 index 721777f852..0000000000 --- a/tests/bash_tests/test_compute_statistics.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -echo "$BASEDIR" -# run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy diff --git a/tests/bash_tests/test_demo_server.sh b/tests/bash_tests/test_demo_server.sh deleted file mode 100755 index ebd0bc8b89..0000000000 --- a/tests/bash_tests/test_demo_server.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -xe - -python -m TTS.server.server & -SERVER_PID=$! - -echo 'Waiting for server...' 
-sleep 30 - -curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis" -python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav - -kill $SERVER_PID - -rm /tmp/audio.wav diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..f260af161e 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,6 +44,9 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False @@ -54,203 +54,200 @@ print(" > Dynamic data loader test: {}".format(DATA_EXIST)) -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], 
mel_input.shape[1] * c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) - last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. - """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. 
- self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): + # load dataset + meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) + items = meta_data_train + meta_data_eval + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=True, + return_wav=True, + tokenizer=tokenizer, + ap=ap, + samples=items, + batch_group_size=bgs, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + start_by_longest=start_by_longest, + ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=True, + num_workers=c.num_loader_workers, + ) + return dataloader, dataset + + +@pytest.mark.parametrize("dataset_config", dataset_configs) +def test_loader(dataset_config: BaseDatasetConfig): + batch_size = 1 + dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True) + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + text_input = data["token_id"] + _ = data["token_id_lengths"] + speaker_name = data["speaker_names"] + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + _ = data["stop_targets"] + _ = data["item_idxs"] + wavs = data["waveform"] + + neg_values = text_input[text_input < 0] + check_count = len(neg_values) + + # check basic conditions + assert check_count == 0 + assert linear_input.shape[0] == mel_input.shape[0] == batch_size + assert linear_input.shape[2] == ap.fft_size 
// 2 + 1
+    assert mel_input.shape[2] == c.audio["num_mels"]
+    assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length
+    assert isinstance(speaker_name[0], str)
+
+    # make sure the mel-spectrogram computed from the returned waveform matches the dataloader output
+    mel_new = ap.melspectrogram(wavs[0].squeeze().numpy())
+    # make sure both mel-spectrograms have the same length and remove the waveform padding
+    mel_new = mel_new[:, : mel_lengths[0]]
+    ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
+    mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
+    assert abs(mel_diff.sum()) < 1e-5
+
+    # check normalization ranges
+    if ap.symmetric_norm:
+        assert mel_input.max() <= ap.max_norm
+        assert mel_input.min() >= -ap.max_norm
+        assert mel_input.min() < 0
+    else:
+        assert mel_input.max() <= ap.max_norm
+        assert mel_input.min() >= 0
+
+
+def test_batch_group_shuffle():
+    dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav)
+    last_length = 0
+    frames = dataset.samples
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        avg_length = mel_lengths.numpy().mean()
+    dataloader.dataset.preprocess_samples()
+    is_items_reordered = False
+    for idx, item in enumerate(dataloader.dataset.samples):
+        if item != frames[idx]:
+            is_items_reordered = True
+            break
+    assert avg_length >= last_length
+    assert is_items_reordered
+
+
+def test_start_by_longest():
+    """Test start_by_longest option.
+
+    The first item of the first batch must be longer than all the other items.
+    """
+    dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
+    dataloader.dataset.preprocess_samples()
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        if i == 0:
+            max_len = mel_lengths[0]
+        print(mel_lengths)
+        assert all(max_len >= mel_lengths)
+
+
+def test_padding_and_spectrograms(tmp_path):
+    def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
+        assert linear_input[idx, -1].sum() != 0  # check padding
+        assert linear_input[idx, -2].sum() != 0
+        assert mel_input[idx, -1].sum() != 0
+        assert mel_input[idx, -2].sum() != 0
+        assert stop_target[idx, -1] == 1
+        assert stop_target[idx, -2] == 0
+        assert stop_target[idx].sum() == 1
+        assert len(mel_lengths.shape) == 1
+        assert mel_lengths[idx] == linear_input[idx].shape[0]
+        assert mel_lengths[idx] == mel_input[idx].shape[0]
+
+    dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav)
+
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        stop_target = data["stop_targets"]
+        item_idx = data["item_idxs"]
+
+        # check mel_spec consistency
+        wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32)
+        mel = ap.melspectrogram(wav).astype("float32")
+        mel = torch.FloatTensor(mel).contiguous()
+        mel_dl = mel_input[0]
+        # NOTE: the difference below should be exactly zero, but for an unknown
+        # reason there is a slight mismatch between the two matrices.
+        # TODO: Check this assert cond more in detail.
+ assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..beb7df689b 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,20 +1,17 @@ -import os +from tests import run_main +from TTS.bin.synthesize import main -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli("tts --list_models") + output_path = str(tmp_path / "output.wav") + + run_main(main, ["--list_models"]) # single speaker model - run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' - ) - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " - "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - f'--text "This is an example." 
--out_path "{output_path}"' - ) + args = ["--text", "This is an example.", "--out_path", output_path] + run_main(main, args) + + args = [*args, "--model_name", "tts_models/en/ljspeech/glow-tts"] + run_main(main, args) + run_main(main, [*args, "--vocoder_name", "vocoder_models/en/ljspeech/multiband-melgan"]) diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts_config.json similarity index 100% rename from tests/inputs/test_align_tts.json rename to tests/inputs/test_align_tts_config.json diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts_config.json similarity index 100% rename from tests/inputs/test_glow_tts.json rename to tests/inputs/test_glow_tts_config.json diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech_config.json similarity index 100% rename from tests/inputs/test_speedy_speech.json rename to tests/inputs/test_speedy_speech_config.json diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad_config.json similarity index 100% rename from tests/inputs/test_vocoder_wavegrad.json rename to tests/inputs/test_vocoder_wavegrad_config.json diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index 9be1f0bf41..25c169eddd 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -24,6 +24,8 @@ def test_currency() -> None: def test_expand_numbers() -> None: assert phoneme_cleaners("-1") == "minus one" assert phoneme_cleaners("1") == "one" + assert phoneme_cleaners("1" + "0" * 35) == "one hundred decillion" + assert phoneme_cleaners("1" + "0" * 36) == "one" + " zero" * 36 def test_multilingual_phoneme_cleaners() -> None: diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py index 4789d53d9e..f4b8d5cadd 100644 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ b/tests/tts_tests/test_neuralhmm_tts_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = NeuralhmmTTSConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = NeuralhmmTTSConfig( + batch_size=3, + eval_batch_size=3, + 
num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py index d86bde6854..e2dec3c899 100644 --- a/tests/tts_tests/test_overflow_train.py +++ b/tests/tts_tests/test_overflow_train.py @@ -1,92 +1,92 @@ -import glob import json -import os import shutil import torch from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.overflow_config import OverflowConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + parameter_path = tmp_path / "lj_parameters.pt" -config = OverflowConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config = OverflowConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch when 
mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch when mel parameters exists + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) + # train the model for one epoch when mel parameters have to be computed from the dataset + if parameter_path.is_file(): + parameter_path.unlink() + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 2aac7f101d..30efe38d9f 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig -config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_speedy_speech_config.json" + output_path = tmp_path / "train_outputs" -config = SpeedySpeechConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = SpeedySpeechConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + 
"--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index d2d1d5c35f..191e0a19ee 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,79 +1,81 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=False, - use_d_vector_file=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=False, + use_d_vector_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + 
"--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index 83a07d1a6c..2696edb1b6 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,77 +1,79 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=True, - num_speakers=4, - max_decoder_steps=50, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + num_speakers=4, + max_decoder_steps=50, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = 
os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index df0e934d8e..f8667b6d02 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,72 +1,72 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = 
max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 17f1fd46a6..cc91b18c34 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -1,64 +1,63 @@ -import glob -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.tacotron_config import TacotronConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = TacotronConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - r=5, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), 
key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = TacotronConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + r=5, + max_decoder_steps=50, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index 741bda91e9..b95e1deed3 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -1,61 +1,61 @@ -import glob -import os import shutil -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 09df7d29f2..189e6cfb4d 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,110 +1,111 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True -# active multispeaker mode -config.model_args.use_speaker_embedding = True -config.use_speaker_embedding = True - -# deactivate multispeaker d-vec mode -config.model_args.use_d_vector_file = False -config.use_d_vector_file = False - -# duration predictor -config.model_args.use_sdp = False -config.use_sdp = False - -# active language sampler -config.use_language_weighted_sampler = True - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API 
-continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -languae_id = "en" -continue_speakers_path = os.path.join(continue_path, "speakers.json") -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + # active multispeaker mode + config.model_args.use_speaker_embedding = True + config.use_speaker_embedding = True + + # deactivate multispeaker d-vec mode + config.model_args.use_d_vector_file = False + config.use_d_vector_file = False + + # duration predictor + config.model_args.use_sdp = False + config.use_sdp = False + + # active language sampler + config.use_language_weighted_sampler = True + + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + 
speaker_id = "ljspeech" + languae_id = "en" + continue_speakers_path = continue_path / "speakers.json" + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index 7ae09c0e5c..8b8757422c 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,117 +1,117 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="multilingual_cleaners", - use_phonemes=False, - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True - -# deactivate multispeaker mode -config.model_args.use_speaker_embedding = False -config.use_speaker_embedding = False - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 -config.d_vector_dim = 256 - -# duration predictor -config.model_args.use_sdp = True -config.use_sdp = True - -# activate language and 
speaker samplers -config.use_language_weighted_sampler = True -config.language_weighted_sampler_alpha = 10 -config.use_speaker_weighted_sampler = True -config.speaker_weighted_sampler_alpha = 5 - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -languae_id = "en" -continue_speakers_path = config.d_vector_file -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + + # deactivate multispeaker mode + config.model_args.use_speaker_embedding = False + config.use_speaker_embedding = False + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 
+    config.d_vector_dim = 256
+
+    # duration predictor
+    config.model_args.use_sdp = True
+    config.use_sdp = True
+
+    # activate language and speaker samplers
+    config.use_language_weighted_sampler = True
+    config.language_weighted_sampler_alpha = 10
+    config.use_speaker_weighted_sampler = True
+    config.speaker_weighted_sampler_alpha = 5
+
+    config.save_json(config_path)
+
+    # train the model for one epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
+        f"--coqpit.output_path {output_path} "
+        "--coqpit.test_delay_epochs 0"
+    )
+    run_cli(command_train)
+
+    # Find latest folder
+    continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime)
+
+    # Inference using TTS API
+    continue_config_path = continue_path / "config.json"
+    continue_restore_path, _ = get_last_checkpoint(continue_path)
+    out_wav_path = tmp_path / "output.wav"
+    speaker_id = "ljspeech-1"
+    language_id = "en"
+    continue_speakers_path = config.d_vector_file
+    continue_languages_path = continue_path / "language_ids.json"
+
+    # Check integrity of the config
+    with continue_config_path.open() as f:
+        config_loaded = json.load(f)
+    assert config_loaded["characters"] is not None
+    assert config_loaded["output_path"] in str(continue_path)
+    assert config_loaded["test_delay_epochs"] == 0
+
+    # Load the model and run inference
+    inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {language_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
+    run_cli(inference_command)
+
+    # restore the model and continue training for one more epoch
+    command_train = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
+    )
+    run_cli(command_train)
+    shutil.rmtree(tmp_path)
diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py
index 69fae21f8d..6678cca90c 100644
--- a/tests/tts_tests/test_vits_speaker_emb_train.py
+++ b/tests/tts_tests/test_vits_speaker_emb_train.py
@@ -1,83 +1,83 @@
-import glob
 import json
-import os
 import shutil
 
 from trainer.io import get_last_checkpoint
 
-from tests import get_device_id, get_tests_output_path, run_cli
+from tests import get_device_id, run_cli
 from TTS.tts.configs.vits_config import VitsConfig
 
-config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
-output_path = os.path.join(get_tests_output_path(), "train_outputs")
+def test_train(tmp_path):
+    config_path = tmp_path / "test_model_config.json"
+    output_path = tmp_path / "train_outputs"
 
-config = VitsConfig(
-    batch_size=2,
-    eval_batch_size=2,
-    num_loader_workers=0,
-    num_eval_loader_workers=0,
-    text_cleaner="english_cleaners",
-    use_phonemes=True,
-    phoneme_language="en-us",
-    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
-    run_eval=True,
-    test_delay_epochs=-1,
-    epochs=1,
-    print_step=1,
-    print_eval=True,
-    test_sentences=[
-        ["Be a voice, not an echo.", "ljspeech-1"],
-    ],
-)
-# set audio config
-config.audio.do_trim_silence = True
-config.audio.trim_db = 60
+    config = VitsConfig(
+        batch_size=2,
+        eval_batch_size=2,
+        num_loader_workers=0,
+        num_eval_loader_workers=0,
+        text_cleaner="english_cleaners",
+        use_phonemes=True,
+        phoneme_language="en-us",
+        phoneme_cache_path=output_path / 
"phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-1"], + ], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) -config.save_json(config_path) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 78f42d154b..e0f7a656b0 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,72 +1,73 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.vits_config import VitsConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + 
"--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py index 91c3c35bc6..1582f51fd4 100644 --- a/tests/tts_tests2/test_align_tts_train.py +++ b/tests/tts_tests2/test_align_tts_train.py @@ -1,72 +1,71 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.align_tts_config import AlignTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = AlignTTSConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) + config = AlignTTSConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, 
"config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(tmp_path) diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py index 1e5cd49f73..74d7a0a734 100644 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py @@ -1,100 +1,98 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs( - use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 -) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", 
"ljspeech-0"], - ], - output_path=output_path, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - speaker_embedding_channels=256, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = False -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs( + use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 + ) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], + output_path=output_path, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + speaker_embedding_channels=256, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = False + config.model_args.use_d_vector_file = True + config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py index 9bbf7a55ea..68f790599e 100644 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py @@ -1,94 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs(use_speaker_embedding=False) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech"], - ], - output_path=output_path, - num_speakers=4, - use_speaker_embedding=True, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.dataset_name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load 
the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs(use_speaker_embedding=False) + + vocoder_config = VocoderConfig() + + config = DelightfulTTSConfig( + model_args=model_args, + audio=audio_config, + vocoder=vocoder_config, + batch_size=2, + eval_batch_size=8, + compute_f0=True, + run_eval=True, + test_delay_epochs=-1, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + epochs=1, + print_step=1, + print_eval=True, + binary_align_loss_alpha=0.0, + use_attn_priors=False, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], + output_path=output_path, + num_speakers=4, + use_speaker_embedding=True, + ) + + # active multispeaker d-vec mode + config.model_args.use_speaker_embedding = True + config.model_args.use_d_vector_file = False + config.model_args.d_vector_file = None + config.model_args.d_vector_dim = 256 + config.save_json(config_path) + + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.dataset_name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + + run_cli(command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py index 3e6fbd2e86..4676ee4869 100644 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ b/tests/tts_tests2/test_delightful_tts_train.py @@ -1,97 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs() + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -vocoder_config = VocoderConfig() + audio_config = DelightfulTtsAudioConfig() + model_args = DelightfulTtsArgs() + vocoder_config = VocoderConfig() -config = DelightfulTTSConfig( - audio=audio_config, - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - run_eval=True, - test_delay_epochs=-1, - binary_align_loss_alpha=0.0, - epochs=1, - print_step=1, - use_attn_priors=False, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], - use_speaker_embedding=False, -) -config.save_json(config_path) + config = DelightfulTTSConfig( + audio=audio_config, + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + run_eval=True, + test_delay_epochs=-1, + binary_align_loss_alpha=0.0, + epochs=1, + print_step=1, + use_attn_priors=False, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo."], + ], + use_speaker_embedding=False, + ) + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - 
"--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs -1" -) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs -1" + ) -run_cli(command_train) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == -1 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == -1 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py index e6bc9f9feb..379e2f346b 100644 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py @@ -1,92 +1,94 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + 
f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py index fe87c8b600..e0838a2049 100644 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ b/tests/tts_tests2/test_fast_pitch_train.py @@ -1,91 +1,93 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fast_pitch_config import FastPitchConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = FastPitchConfig( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + 
config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py index 735d2fc4c6..348729c6f4 100644 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py @@ -1,95 +1,97 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "fast_pitch_speaker_emb_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=tmp_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + 
epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = True + config.model_args.use_speaker_embedding = True + config.audio.trim_db = 60 + config.save_json(config_path) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py index 07fc5a1a2c..ab513ec827 100644 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ b/tests/tts_tests2/test_fastspeech_2_train.py @@ -1,94 +1,96 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.fastspeech2_config import Fastspeech2Config -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) + audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, + ) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) + config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path=output_path / "energy_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + 
print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, + ) + config.audio.do_trim_silence = True + config.use_speaker_embedding = False + config.model_args.use_speaker_embedding = False + config.audio.trim_db = 60 + config.save_json(config_path) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + run_cli(command_train) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py index 8236607c25..f03139ac77 100644 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_glow_tts_d-vectors_train.py @@ -1,79 +1,80 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=False, + use_d_vector_file=True, + d_vector_file="tests/data/ljspeech/speakers.json", + d_vector_dim=256, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + 
"--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = config.d_vector_file -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py index 4a8bd0658d..b9fe93a2fa 100644 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py @@ -1,76 +1,77 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=True, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + use_speaker_embedding=True, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path 
tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + continue_speakers_path = continue_path / "speakers.json" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py index 1d7f913575..3f1bf3a794 100644 --- a/tests/tts_tests2/test_glow_tts_train.py +++ b/tests/tts_tests2/test_glow_tts_train.py @@ -1,73 +1,74 @@ -import glob import json -import os import shutil from trainer.io import get_last_checkpoint -from tests import get_device_id, get_tests_output_path, run_cli +from tests import get_device_id, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) + config = GlowTTSConfig( + batch_size=2, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=tmp_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + data_dep_init_steps=1.0, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + 
"--coqpit.test_delay_epochs 0" + ) + run_cli(command_train) -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) + # Load the model and run inference + inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" + run_cli(inference_command) -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 9d4e193382..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = 
max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index c506fb48dc..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import HifiganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 6ef9cd495b..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 
8002760706..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index a126befe2e..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_training.py b/tests/vocoder_tests/test_training.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/vocoder_tests/test_training.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from 
TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), 
"data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def 
test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 43b5f08042..7530bec426 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -1,5 +1,3 @@ -import unittest - import numpy as np import torch from torch import optim @@ -10,50 +8,45 @@ # pylint: disable=unused-variable torch.manual_seed(1) -use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class WavegradTrainTest(unittest.TestCase): - def test_train_step(self): # pylint: disable=no-self-use - """Test if all layers are updated in a basic training cycle""" - input_dummy = torch.rand(8, 1, 20 * 300).to(device) - mel_spec = torch.rand(8, 80, 20).to(device) - - criterion = torch.nn.L1Loss().to(device) - args = WavegradArgs( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 
4, 8]], +def test_train_step(): + """Test if all layers are updated in a basic training cycle""" + torch.set_grad_enabled(True) + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + args = WavegradArgs( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) + + model_ref = Wavegrad(config) + model.train() + model.to(device) + betas = np.linspace(1e-6, 1e-2, 1000) + model.compute_noise_level(betas) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + optimizer = optim.Adam(model.parameters(), lr=0.001) + for _ in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): + # ignore pre-highway layer since it works conditionally + # if count not in [145, 59]: + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + i, param.shape, param, param_ref ) - config = WavegradConfig(model_params=args) - model = Wavegrad(config) - - model_ref = Wavegrad(config) - model.train() - model.to(device) - betas = np.linspace(1e-6, 1e-2, 1000) - model.compute_noise_level(betas) - model_ref.load_state_dict(model.state_dict()) - model_ref.to(device) - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) - for i in range(5): - y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) - optimizer.zero_grad() - loss = criterion(y_hat, input_dummy) - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 9b10759505..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import os -import shutil -import unittest - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig - - -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 337e24259f..0000000000 --- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,45 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) 
-shutil.rmtree(continue_path) diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py index bb592f1f2d..4d22b8102f 100644 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ b/tests/xtts_tests/test_xtts_gpt_train.py @@ -1,10 +1,9 @@ -import os -import shutil +from pathlib import Path +import pytest import torch from trainer import Trainer, TrainerArgs -from tests import get_tests_output_path from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.layers.xtts.dvae import DiscreteVAE @@ -28,37 +27,9 @@ DASHBOARD_LOGGER = "tensorboard" LOGGER_URI = None -# Set here the path that the checkpoints will be saved. Default: ./run/training/ -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -MEL_NORM_FILE = os.path.join( - OUT_PATH, "mel_stats.pth" -) # Mel spectrogram norms, required for dvae mel spectrogram extraction -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - +XTTS_CHECKPOINT = None # model.pth file # Training sentences generations SPEAKER_REFERENCE = [ @@ -66,99 +37,122 @@ ] # speaker reference to be used in training test sentences LANGUAGE = config_dataset.language - # Training Parameters OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False START_WITH_EVAL = False # if True it will star with evaluation BATCH_SIZE = 2 # set here the batch size GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. 
- - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, -) -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description=""" - GPT XTTS training - """, - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) +# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 +# for more efficient training. You can increase/decrease BATCH_SIZE but then set +# GRAD_ACUMM_STEPS accordingly. 
-# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() -# remove output path -shutil.rmtree(OUT_PATH) +@pytest.mark.parametrize("use_perceiver", [False, True]) +def test_xtts_gpt_train(tmp_path: Path, use_perceiver: bool): + # Create DVAE checkpoint and mel_norms on test time + # DVAE parameters: For the training we need the dvae to extract the dvae tokens, + # given that you must provide the paths for this model + DVAE_CHECKPOINT = tmp_path / "dvae.pth" + # Mel spectrogram norms for dvae mel spectrogram extraction + MEL_NORM_FILE = tmp_path / "mel_stats.pth" + dvae = DiscreteVAE( + channels=80, + normalization=None, + positional_dims=1, + num_tokens=8192, + codebook_dim=512, + hidden_dim=512, + num_resnet_blocks=3, + kernel_size=3, + num_layers=2, + use_transposed_convs=False, + ) + torch.save(dvae.state_dict(), DVAE_CHECKPOINT) + mel_stats = torch.ones(80) + torch.save(mel_stats, MEL_NORM_FILE) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=255995, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=8194, + gpt_start_audio_token=8192, + gpt_stop_audio_token=8193, + gpt_use_perceiver_resampler=use_perceiver, + ) + + config = GPTTrainerConfig( + epochs=1, + output_path=tmp_path, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description="GPT XTTS training", + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=1000, + save_step=10000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. + optimizer="AdamW", + optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, + optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, + lr=5e-06, # learning rate + lr_scheduler="MultiStepLR", + # it was adjusted accordingly for the new step scheme + lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, + test_sentences=[ + { + "text": "This cake is great. 
It's so delicious and moist.", + "speaker_wav": SPEAKER_REFERENCE, + "language": LANGUAGE, + }, + ], + ) + + # init the model from config + model = GPTTrainer.init_from_config(config) + + # load training samples + train_samples, eval_samples = load_tts_samples( + DATASETS_CONFIG_LIST, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the trainer and 🚀 + trainer = Trainer( + TrainerArgs( + restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need to restore it using Trainer restore_path parameter + skip_train_epoch=False, + start_with_eval=True, + grad_accum_steps=GRAD_ACUMM_STEPS, + ), + config, + output_path=tmp_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + ) + trainer.fit() diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py deleted file mode 100644 index 454e867385..0000000000 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ /dev/null @@ -1,164 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig -from TTS.tts.models.xtts import XttsAudioConfig -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" -PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -# Mel spectrogram norms, required for dvae mel spectrogram extraction -MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth") -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. 
-TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, - gpt_use_masking_gt_prompt_approach=True, - gpt_use_perceiver_resampler=True, -) - -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) - -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description="GPT XTTS training", - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. 
It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/zoo_tests/test_big_models.py b/tests/zoo_tests/test_big_models.py new file mode 100644 index 0000000000..8a9780b4f0 --- /dev/null +++ b/tests/zoo_tests/test_big_models.py @@ -0,0 +1,193 @@ +"""These tests should be run locally because the models are too big for CI.""" + +import os + +import pytest +import torch + +from tests import get_tests_data_path, run_main +from TTS.bin.synthesize import main +from TTS.utils.manage import ModelManager + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + os.environ["COQUI_TOS_AGREED"] = "1" + + +@pytest.fixture +def manager(): + """Set up model manager.""" + return ModelManager(progress_bar=False) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts(tmp_path): + """XTTS is too big to run on github actions. We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v1.1", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") + speaker_wav.append(speaker_wav_2) + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v1.1") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too 
big for CI") +def test_xtts_v2(tmp_path): + """XTTS is too big to run on github actions. We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v2", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_v2_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v2") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + normal_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=1.5, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + fast_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=0.66, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + slow_len = sum([len(chunk) for chunk in wav_chunks]) + + assert slow_len > normal_len + assert normal_len > fast_len + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_tortoise(tmp_path): + args = [ + "--model_name", + "tts_models/en/multi-dataset/tortoise-v2", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_bark(tmp_path): + """Bark is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/bark", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index f38880b51f..b7c88e0730 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3` -import glob import os import shutil -import torch -from trainer.io import get_user_data_dir +import pytest -from tests import get_tests_data_path, get_tests_output_path, run_cli +from tests import get_tests_data_path, run_main +from TTS.api import TTS +from TTS.bin.synthesize import main from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.manage import ModelManager @@ -19,249 +19,80 @@ ] -def run_models(offset=0, step=1): - """Check if all the models are downloadable and tts models run correctly.""" - print(" > Run synthesizer with all the models.") - output_path = os.path.join(get_tests_output_path(), "output.wav") - manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False) - model_names = [name for name in manager.list_models() if name not in MODELS_WITH_SEP_TESTS] - print("Model names:", model_names) - for model_name in model_names[offset::step]: - print(f"\n > Run - {model_name}") - model_path, _, _ = manager.download_model(model_name) - if "tts_models" in model_name: - local_download_dir = os.path.dirname(model_path) - # download and run the model - speaker_files = glob.glob(local_download_dir + "/speaker*") - language_files = glob.glob(local_download_dir + "/language*") - speaker_arg = "" - language_arg = "" - if len(speaker_files) > 0: - # multi-speaker model - if "speaker_ids" in speaker_files[0]: - speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) - elif "speakers" in speaker_files[0]: - speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) - speakers = list(speaker_manager.name_to_id.keys()) - if len(speakers) > 1: - speaker_arg = f'--speaker_idx "{speakers[0]}"' - if len(language_files) > 0 and "language_ids" in language_files[0]: - # multi-lingual model - language_manager = LanguageManager(language_ids_file_path=language_files[0]) - languages = language_manager.language_names - if len(languages) > 1: - language_arg = f'--language_idx "{languages[0]}"' - run_cli( - f'tts --model_name {model_name} --text "This is an example." ' - f'--out_path "{output_path}" {speaker_arg} {language_arg} --no-progress_bar' - ) - # remove downloaded models - shutil.rmtree(local_download_dir) - shutil.rmtree(get_user_data_dir("tts")) - elif "voice_conversion_models" in model_name: - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - run_cli( - f"tts --model_name {model_name} " - f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar' - ) - else: - # only download the model - manager.download_model(model_name) - print(f" | > OK: {model_name}") - - -def test_xtts(): - """XTTS is too big to run on github actions. 
We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) +@pytest.fixture(autouse=True) +def run_around_tests(tmp_path): + """Download models to a temp folder and delete it afterwards.""" + os.environ["TTS_HOME"] = str(tmp_path) + yield + shutil.rmtree(tmp_path) + + +@pytest.fixture +def manager(tmp_path): + """Set up model manager.""" + return ModelManager(output_prefix=tmp_path, progress_bar=False) + + +# To split tests into different CI jobs +num_partitions = int(os.getenv("NUM_PARTITIONS", "1")) +partition = int(os.getenv("TEST_PARTITION", "0")) +model_names = [name for name in TTS.list_models() if name not in MODELS_WITH_SEP_TESTS] +model_names = [name for i, name in enumerate(model_names) if i % num_partitions == partition] + + +@pytest.mark.parametrize("model_name", model_names) +def test_models(tmp_path, model_name, manager): + print(f"\n > Run - {model_name}") + output_path = str(tmp_path / "output.wav") + model_path, _, _ = manager.download_model(model_name) + args = ["--model_name", model_name, "--out_path", output_path, "--no-progress_bar"] + if "tts_models" in model_name: + local_download_dir = model_path.parent + # download and run the model + speaker_files = list(local_download_dir.glob("speaker*")) + language_files = list(local_download_dir.glob("language*")) + speaker_arg = [] + language_arg = [] + if len(speaker_files) > 0: + # multi-speaker model + if "speaker_ids" in speaker_files[0].stem: + speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) + elif "speakers" in speaker_files[0].stem: + speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) + speakers = list(speaker_manager.name_to_id.keys()) + if len(speakers) > 1: + speaker_arg = ["--speaker_idx", speakers[0]] + if len(language_files) > 0 and "language_ids" in language_files[0].stem: + # multi-lingual model + language_manager = LanguageManager(language_ids_file_path=language_files[0]) + languages = language_manager.language_names + if len(languages) > 1: + language_arg = ["--language_idx", languages[0]] + run_main(main, [*args, "--text", "This is an example.", *speaker_arg, *language_arg]) + elif "voice_conversion_models" in model_name: + speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") + reference_wav1 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0028.wav") + reference_wav2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") + run_main(main, [*args, "--source_wav", speaker_wav, "--target_wav", reference_wav1, reference_wav2]) else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) - - -def test_xtts_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts - - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - speaker_wav.append(speaker_wav_2) - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + # only download the model + manager.download_model(model_name) + print(f" | > OK: {model_name}") - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - - -def test_xtts_v2(): - """XTTS is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) - else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) - -def test_xtts_v2_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts - - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) - - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - normal_len = sum([len(chunk) for chunk in wav_chuncks]) - - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=1.5, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - fast_len = sum([len(chunk) for chunk in wav_chuncks]) - - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=0.66, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - slow_len = sum([len(chunk) for chunk in wav_chuncks]) - - assert slow_len > normal_len - assert normal_len > fast_len - - -def test_tortoise(): - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) - - -def test_bark(): - """Bark is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar' - ) - - -def test_voice_conversion(): +def test_voice_conversion(tmp_path): print(" > Run voice conversion inference using YourTTS model.") - model_name = "tts_models/multilingual/multi-dataset/your_tts" - language_id = "en" - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli( - f"tts --model_name {model_name}" - f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar" - ) - - -""" -These are used to split tests into different actions on Github. -""" - - -def test_models_offset_0_step_3(): - run_models(offset=0, step=3) - - -def test_models_offset_1_step_3(): - run_models(offset=1, step=3) - - -def test_models_offset_2_step_3(): - run_models(offset=2, step=3) + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/your_tts", + "--out_path", + str(tmp_path / "output.wav"), + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + "--reference_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav"), + "--language_idx", + "en", + "--no-progress_bar", + ] + run_main(main, args)