Skip to content

Commit

Permalink
use eleven_multilingual_v2 model for improved stability, accuracy and…
Browse files Browse the repository at this point in the history
… quality
  • Loading branch information
nwaughachukwuma committed Oct 31, 2024
1 parent f14f71d commit 162f886
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 27 deletions.
2 changes: 1 addition & 1 deletion src/utils/audio_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ async def text_to_speech(self, audio_script: str, output_file: str):
"""
tags = self._get_tags(audio_script)
audio_script = clean_tss_markup(audio_script, tags)

nway_content = self.split_content(audio_script, tags)

print(f"nway_content: {nway_content}")

if self.config.tts_provider == "openai":
Expand Down
10 changes: 8 additions & 2 deletions src/utils/audio_manager_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
from pathlib import Path
from typing import Any, List, Optional, Tuple

from src.utils.generate_speech_utils import GenerateSpeech, SpeechJob, TTSProvider
from src.utils.generate_speech_utils import (
ElevenLabsVoice,
GenerateSpeech,
OpenaiVoice,
SpeechJob,
TTSProvider,
)


@dataclass
Expand Down Expand Up @@ -36,7 +42,7 @@ def _prepare_speech_jobs(
self,
nway_content: List[Tuple[str, str]],
tags: List[str],
voices: List[Any],
voices: List[OpenaiVoice] | List[ElevenLabsVoice],
temp_audio_dir: str,
):
jobs: List[SpeechJob] = []
Expand Down
36 changes: 36 additions & 0 deletions src/utils/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import asyncio
from functools import wraps
from time import time


def process_time():
    """Build a decorator that prints how long the wrapped callable took.

    The returned decorator supports both plain functions and coroutine
    functions: a coroutine function is wrapped by an ``async`` wrapper that
    awaits it, anything else by a regular wrapper. In both cases the wrapped
    callable's return value is passed through unchanged and a line of the
    form ``Execution time for <name>: <seconds>s`` is printed after it
    returns. Nothing is printed when the wrapped callable raises.

    Returns:
        A decorator that takes a callable and returns the timed wrapper.
    """
    # Local import: a monotonic clock is the right tool for measuring
    # durations (wall-clock time can jump when the system clock is adjusted),
    # and importing here avoids touching the module-level imports.
    from time import perf_counter

    def report(name, started_at):
        # Single place for the message so sync and async paths stay in step.
        time_diff = f"{(perf_counter() - started_at):.2f}s"
        print(f"Execution time for {name}: {time_diff}")

    def decorator(func):
        if asyncio.iscoroutinefunction(func):

            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                start_time = perf_counter()
                response = await func(*args, **kwargs)
                report(func.__name__, start_time)
                return response

            return async_wrapper

        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = perf_counter()
            response = func(*args, **kwargs)
            report(func.__name__, start_time)
            return response

        return wrapper

    return decorator
62 changes: 44 additions & 18 deletions src/utils/generate_speech_utils.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,44 @@
from dataclasses import dataclass
from io import BytesIO
from typing import List, Literal

from elevenlabs import VoiceSettings
from typing import Dict, List, Literal

from src.services.elevenlabs_client import get_elevenlabs_client
from src.services.openai_client import get_openai
from src.utils.decorators import process_time

# Supported text-to-speech backends.
TTSProvider = Literal["openai", "elevenlabs"]

# Voice names accepted for the OpenAI backend, as a type and as a runtime list
# (the list is what the membership checks in this module use).
OpenaiVoice = Literal["onyx", "shimmer", "echo", "nova", "alloy"]
openai_voices: List[OpenaiVoice] = ["onyx", "shimmer", "echo", "nova", "alloy"]

# Voice names accepted for the ElevenLabs backend.
ElevenLabsVoice = Literal[
    "Adam", "Sarah", "Laura", "Charlie", "George", "Charlotte", "Liam"
]
# Defined once, with its element type; the earlier untyped duplicate assignment
# was dead code because it was immediately overwritten by this one.
elevenlabs_voices: List[ElevenLabsVoice] = [
    "Adam",
    "Sarah",
    "Laura",
    "Charlie",
    "George",
    "Charlotte",
    "Liam",
]

# Maps each ElevenLabs voice name to an identifier string.
# NOTE(review): presumably these are ElevenLabs voice IDs for the
# `voice_id=` API parameter — confirm against the ElevenLabs voice library.
elevenlabs_voice_to_id: Dict[ElevenLabsVoice, str] = {
    "Adam": "pNInz6obpgDQGcFmaJgB",
    "Sarah": "EXAVITQu4vr4xnSDxMaL",
    "Laura": "FGY2WhTYpPnrIDTdsKH5",
    "Charlie": "IKne3meq5aSn9XLyUdCD",
    "George": "JBFqnCBsd6RMkjVDRZzb",
    "Charlotte": "XB0fDUnXU5powFXDhCwa",
    "Liam": "TX3LPaxmHKxFdv7VOQHJ",
}


@dataclass
class SpeechJob:
content: str
voice: OpenaiVoice
voice: OpenaiVoice | ElevenLabsVoice
output_file: str
tag: str
index: int
Expand All @@ -35,36 +53,44 @@ def __init__(self, provider: TTSProvider):
def run(self, job: SpeechJob):
    """Generate speech for ``job`` and write it to ``job.output_file``.

    The backend is chosen from ``self.provider``: ``"elevenlabs"`` uses the
    ElevenLabs helper, anything else uses the OpenAI helper. The helper's
    result is written to ``job.output_file`` in binary mode.

    Args:
        job: The speech job; ``content``/``voice`` are consumed by the
            backend helper, ``output_file`` is the destination path, and
            ``tag``/``index`` are only used in the progress messages.

    Returns:
        ``job.output_file`` on success, or an empty string if anything
        raised while generating or writing the audio.
    """
    try:
        # Dispatch exactly once. The previous inverted conditional called the
        # OpenAI helper when the provider was "elevenlabs" (and vice versa)
        # before the correct branch ran, hitting the wrong backend first.
        if self.provider == "elevenlabs":
            content = self.__use_elevenlabs(job)
        else:
            content = self.__use_openai(job)

        with open(job.output_file, "wb") as file:
            file.write(content)

        print(f"Generated speech for tag {job.tag} at index {job.index}")
        return job.output_file
    except Exception as e:
        # Deliberate best-effort boundary: a failed job is reported once and
        # signalled to the caller via the empty-string return value.
        print(f"Failed to generate speech for tag: {job.tag}. Error: {str(e)}")
        return ""

@process_time()
def __use_openai(self, job: SpeechJob):
    """Synthesize ``job.content`` with OpenAI's ``tts-1-hd`` model.

    Args:
        job: The speech job; ``job.voice`` must be one of ``openai_voices``.

    Returns:
        The ``content`` attribute of the speech API response.

    Raises:
        ValueError: If ``job.voice`` is not listed in ``openai_voices``.
    """
    requested_voice = job.voice
    if requested_voice not in openai_voices:
        raise ValueError("Wrong voice specification for openai tts")

    speech_api = get_openai().audio.speech
    result = speech_api.create(
        model="tts-1-hd",
        voice=requested_voice,
        input=job.content,
    )
    return result.content

@process_time()
def __use_elevenlabs(self, job: SpeechJob):
response = get_elevenlabs_client().text_to_speech.convert(
voice_id=job.voice,
output_format="mp3_22050_32",
if job.voice not in elevenlabs_voices:
raise ValueError("Wrong voice specification for elevenlabs tts")
# response = get_elevenlabs_client().text_to_speech.convert(
# model_id="eleven_turbo_v2_5", # use the turbo model for low latency
# text=job.content,
# voice_id=elevenlabs_voice_to_id[job.voice],
# output_format="mp3_22050_32",
# )
response = get_elevenlabs_client().generate(
model="eleven_multilingual_v2",
text=job.content,
model_id="eleven_turbo_v2_5", # use the turbo model for low latency
voice_settings=VoiceSettings(
stability=0.0, similarity_boost=1.0, style=0.0, use_speaker_boost=True
),
voice=job.voice,
)

buffer = BytesIO()
Expand Down
10 changes: 4 additions & 6 deletions src/utils/main_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
from pydantic import BaseModel

from src.services.storage import StorageManager
from src.utils.audio_manager import AudioManager

# from src.utils.audio_synthesizer import AudioSynthesizer
from src.utils.audio_manager import AudioManager, AudioManagerConfig
from src.utils.audiocast_request import AudioScriptMaker, generate_source_content
from src.utils.chat_request import chat_request
from src.utils.chat_utils import (
Expand Down Expand Up @@ -81,10 +79,10 @@ async def generate_audiocast(request: GenerateAudioCastRequest):
# STEP 3: Generate audio from the audio script
with container.container():
container.info("Generating audio...")
output_file = await AudioManager().generate_speech(audio_script)
output_file = await AudioManager(
custom_config=AudioManagerConfig(tts_provider="elevenlabs")
).generate_speech(audio_script)

# container.info("Enhancing audio quality...")
# AudioSynthesizer().enhance_audio_minimal(Path(output_file))
print(f"output_file: {output_file}")

# TODO: Use a background service
Expand Down

0 comments on commit 162f886

Please sign in to comment.