huggingface · GTimothee · Apr 15, 2025 · Apr 15, 2025 · Apr 15, 2025 · Apr 15, 2025
diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py
@@ -271,10 +271,17 @@ def log_user_message(self, text_input, file_uploads_log):
             gr.Button(interactive=False),
         )
 
-    def launch(self, share: bool = True, **kwargs):
-        self.create_app().launch(debug=True, share=share, **kwargs)
+    def launch(self, share: bool = True, speech2text_func=None, **kwargs):  # optional audio->text callable
+        self.create_app(speech2text_func).launch(debug=True, share=share, **kwargs)
+
+    def create_app(self, speech2text_func = None):
+
+        def handle_input(text_input, audio_input):  # transcribed audio takes precedence over typed text
+            if audio_input and speech2text_func is not None:
+                return speech2text_func(audio_input)
+            return text_input
+
 
-    def create_app(self):
         import gradio as gr
 
         with gr.Blocks(theme="ocean", fill_height=True) as demo:
@@ -298,6 +305,18 @@ def create_app(self):
                         container=False,
                         placeholder="Enter your prompt here and press Shift+Enter or press the button",
                     )
+                    if speech2text_func:
+                        audio_input = gr.Audio(
+                            sources=["upload", "microphone"], 
+                            label="Voice Input",
+                            waveform_options=gr.WaveformOptions(
+                            waveform_color="#01C6FF",
+                            waveform_progress_color="#0066B4",
+                            skip_length=2,
+                            show_controls=False,
+                        ),)
+                    else:
+                        audio_input = None
                     submit_btn = gr.Button("Submit", variant="primary")
 
                 # If an upload folder is provided, enable the upload feature
@@ -346,10 +365,22 @@ def create_app(self):
             )
 
             submit_btn.click(
+                handle_input,
+                inputs=[text_input, audio_input], 
+                outputs=[text_input],  # Update the text input with transcribed text if audio is provided
+            ).then(
+                lambda: None,
+                inputs=None,
+                outputs=[audio_input],  # Clear audio_input
+            ).then(
                 self.log_user_message,
                 [text_input, file_uploads_log],
                 [stored_messages, text_input, submit_btn],
-            ).then(self.interact_with_agent, [stored_messages, chatbot, session_state], [chatbot]).then(
+            ).then(
+                self.interact_with_agent,
+                [stored_messages, chatbot, session_state],
+                [chatbot],
+            ).then(
                 lambda: (
                     gr.Textbox(
                         interactive=True, placeholder="Enter your prompt here and press Shift+Enter or the button"