Skip to content

Commit 9383697

Browse files
committed
Add a TTS system with various voice options.
1 parent d4d30cf commit 9383697

File tree

1 file changed

+63
-8
lines changed

1 file changed

+63
-8
lines changed

voice_stt_mode.py

+63-8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Airtime and Messaging Service using Africa's Talking API
2+
Airtime and Messaging Service using Africa's Talking API
33
44
This script provides a Gradio-based web interface for sending airtime and messages
55
using the Africa's Talking API. It also tracks the carbon emissions of the operations
@@ -37,6 +37,7 @@
3737
from logging.handlers import RotatingFileHandler
3838
import asyncio
3939
from importlib.metadata import version, PackageNotFoundError
40+
import tempfile
4041

4142
# Third-Party Library Imports
4243
import gradio as gr
@@ -45,6 +46,7 @@
4546
import numpy as np
4647
import soundfile as sf
4748
import ollama
49+
import edge_tts
4850

4951
# Local Module Imports
5052
from utils.function_call import send_airtime, send_message, search_news, translate_text
@@ -60,7 +62,6 @@
6062
langtrace.init(api_key=os.getenv("LANGTRACE_API_KEY"))
6163
groq_client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
6264

63-
6465
# Set up the logger
6566
logger = logging.getLogger(__name__)
6667
logger.setLevel(logging.DEBUG) # Set the logger to handle all levels DEBUG and above
@@ -109,6 +110,7 @@
109110
"groq",
110111
"soundfile",
111112
"numpy",
113+
"edge-tts", # Add edge-tts to version checking
112114
]
113115

114116
for pkg in pkgs:
@@ -120,6 +122,24 @@
120122
except Exception as e:
121123
logger.error("Failed to retrieve version for %s: %s", pkg, str(e))
122124

125+
# ------------------------------------------------------------------------------------
126+
# TTS Configuration
127+
# ------------------------------------------------------------------------------------
128+
129+
VOICE = "sw-TZ-RehemaNeural"
130+
OUTPUT_FILE = "tts_output.mp3" # Saved in current working directory
131+
132+
133+
async def text_to_speech(text: str) -> None:
    """
    Synthesize ``text`` to speech with edge-tts and save it to ``OUTPUT_FILE``.

    The module-level ``VOICE`` selects the voice. Every call writes to the
    same ``OUTPUT_FILE`` path.

    Args:
        text: The text to convert to speech.

    Raises:
        Exception: Any error raised while synthesizing or saving is logged
            and then re-raised unchanged.
    """
    try:
        communicate = edge_tts.Communicate(text, VOICE)
        await communicate.save(OUTPUT_FILE)
    except Exception as e:
        # logger.exception records the traceback; lazy %-args match the
        # logging style used by the rest of this module.
        logger.exception("TTS Error: %s", e)
        raise
    else:
        logger.info("Generated speech output: %s", OUTPUT_FILE)
141+
142+
123143
# ------------------------------------------------------------------------------------
124144
# Define Tools Schema
125145
# ------------------------------------------------------------------------------------
@@ -384,12 +404,12 @@ async def process_audio_and_llm(audio):
384404
y /= np.max(np.abs(y))
385405

386406
# Write audio to buffer
407+
buffer = io.BytesIO()
387408
sf.write(buffer, y, sr, format="wav")
388409
buffer.seek(0)
389410

390411
try:
391412
# Get transcription from Groq
392-
# add the import here then text will be cut out for the client
393413
transcription = groq_client.audio.transcriptions.create(
394414
model="distil-whisper-large-v3-en",
395415
file=("audio.wav", buffer),
@@ -471,6 +491,9 @@ def gradio_interface(message: str, history: list) -> str:
471491
audio_output = gr.Textbox(
472492
label="Final Result", placeholder="LLM response will appear here..."
473493
)
494+
tts_button = gr.Button("Play TTS")
495+
tts_audio = gr.Audio(label="TTS Output")
496+
474497
with gr.Row():
475498
transcribe_button = gr.Button("Transcribe")
476499
process_button = gr.Button("Process Edited Text", variant="primary")
@@ -521,6 +544,20 @@ def show_transcription(audio):
521544
logger.exception("Error during transcription: %s", e)
522545
return f"Error: {str(e)}"
523546

547+
# Define TTS Function
548+
# The return annotation is quoted so it is never evaluated at definition
# time, which keeps the ``X | None`` spelling safe on older Python versions.
async def generate_tts(text: str) -> "str | None":
    """
    Generate TTS audio and return the file path.

    Args:
        text: The text to convert to speech.

    Returns:
        ``OUTPUT_FILE`` on success. ``None`` when ``text`` is empty or
        whitespace-only, or when speech generation fails (the error is
        logged, not raised).
    """
    # Nothing to synthesize (e.g. the button was pressed before any result
    # text exists): skip the TTS request instead of treating it as an error.
    if not text or not text.strip():
        return None

    try:
        communicate = edge_tts.Communicate(text, VOICE)
        await communicate.save(OUTPUT_FILE)
    except Exception as e:
        # Best-effort by design: log with traceback and signal failure
        # to the caller by returning None.
        logger.exception("TTS Generation Error: %s", e)
        return None

    logger.info("TTS audio generated successfully: %s", OUTPUT_FILE)
    return OUTPUT_FILE
560+
524561
# Wire up the components
525562
transcribe_button.click(
526563
fn=show_transcription, inputs=audio_input, outputs=transcription_preview
@@ -533,6 +570,13 @@ def show_transcription(audio):
533570
outputs=audio_output,
534571
)
535572

573+
# Connect TTS Button to Function
574+
tts_button.click(
575+
fn=lambda txt: asyncio.run(generate_tts(txt)),
576+
inputs=audio_output,  # The "Final Result" textbox holding the LLM response
577+
outputs=tts_audio,
578+
)
579+
536580
# Text input tab
537581
with gr.Tab("Text Input"):
538582
chat_interface = gr.ChatInterface(
@@ -551,16 +595,27 @@ def show_transcription(audio):
551595
scan_button = gr.Button("Scan Receipt")
552596
result_text = gr.Textbox(label="Analysis Result")
553597

554-
scan_button.click(
555-
fn=lambda img: asyncio.run(
556-
process_user_message(
557-
"Analyze this receipt", [], use_vision=True, image_path=img
598+
async def process_with_speech(image):
    """
    Analyze a receipt image and return the text result.

    Despite its name, this function does not generate any speech; it only
    forwards the image to ``process_user_message`` with vision enabled.

    Args:
        image: Passed to ``process_user_message`` as ``image_path``.

    Returns:
        The result of ``process_user_message``, or the error message as a
        string if processing raises.
    """
    try:
        return await process_user_message(
            "Analyze this receipt", [], use_vision=True, image_path=image
        )
    except Exception as e:
        # Surface the failure in the UI as text, but keep the traceback in
        # the log; lazy %-args match the module's logging style.
        logger.exception("Processing error: %s", e)
        return str(e)
608+
609+
scan_button.click(
610+
fn=lambda img: asyncio.run(process_with_speech(img)),
560611
inputs=image_input,
561612
outputs=result_text,
562613
)
563614

615+
# ------------------------------------------------------------------------------------
616+
# Launch Gradio Interface
617+
# ------------------------------------------------------------------------------------
618+
564619
if __name__ == "__main__":
565620
try:
566621
logger.info("Launching Gradio interface...")

0 commit comments

Comments
 (0)