# Local Module Imports
from utils.function_call import send_airtime, send_message, search_news, translate_text
+ from typing import Optional
+ from utils.models import ReceiptData, LineItem
+ from utils.constants import VISION_SYSTEM_PROMPT, API_SYSTEM_PROMPT
# ------------------------------------------------------------------------------------
# Logging Configuration
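The new imports pull in ReceiptData and LineItem from utils.models, but the diff never shows their definitions. A minimal sketch of what such Pydantic models could look like, with field names chosen purely for illustration (the repo's actual schema may differ):

```python
# Hypothetical sketch of utils/models.py; field names are assumptions
from pydantic import BaseModel


class LineItem(BaseModel):
    """A single purchased item on the receipt."""

    name: str
    quantity: int
    price: float


class ReceiptData(BaseModel):
    """Structured fields the vision model is asked to extract."""

    merchant: str
    date: str
    items: list[LineItem]
    total: float
```

Passing ReceiptData.model_json_schema() as the format argument, as the diff does later, constrains the model to emit JSON matching this schema.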
@with_langtrace_root_span()
- async def process_user_message(message: str, history: list) -> str:
+ async def process_user_message(
+     message: str,
+     history: list,
+     use_vision: bool = False,
+     image_path: Optional[str] = None,
+ ) -> str:
    """
    Handle the conversation with the model asynchronously.
@@ -233,6 +241,10 @@ async def process_user_message(message: str, history: list) -> str:
        The user's input message.
    history : list of list of str
        The conversation history up to that point.
+     use_vision : bool, optional
+         Whether to use the vision model for processing (default is False).
+     image_path : str, optional
+         Path to the image file for the vision model (default is None).

    Returns
    -------
@@ -241,19 +253,26 @@ async def process_user_message(message: str, history: list) -> str:
    """
    logger.info("Processing user message: %s", message)
    client = ollama.AsyncClient()
+     messages = []

-     messages = [
-         {
-             "role": "user",
-             "content": message,
-         }
-     ]
+     # Add system prompt
+     system_prompt = VISION_SYSTEM_PROMPT if use_vision else API_SYSTEM_PROMPT
+     messages.append({"role": "system", "content": system_prompt})
+
+     # Add user message with image if present
+     if use_vision and image_path:
+         messages.append({"role": "user", "content": message, "images": [image_path]})
+     else:
+         messages.append({"role": "user", "content": message})

    try:
+         model_name = "llama3.2-vision" if use_vision else "qwen2.5:0.5b"
        response = await client.chat(
-             model="qwen2.5:0.5b",
+             model=model_name,
            messages=messages,
-             tools=tools,
+             tools=None if use_vision else tools,
+             format=ReceiptData.model_json_schema() if use_vision else None,
+             options={"temperature": 0},
        )
    except Exception as e:
        logger.exception("Failed to get response from Ollama client.")
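Because the vision path sets format=ReceiptData.model_json_schema(), the reply content should be a JSON string matching that schema. A hedged, self-contained sketch of how such a structured response could be validated back into the Pydantic model (the two-field ReceiptData here is illustrative; the repo's real model and parsing may differ):

```python
# Sketch only: validate Ollama's schema-constrained reply into a Pydantic model.
# ReceiptData here is a stand-in; the repo's model lives in utils/models.py.
import asyncio

import ollama
from pydantic import BaseModel


class ReceiptData(BaseModel):
    merchant: str
    total: float


async def scan(image_path: str) -> ReceiptData:
    client = ollama.AsyncClient()
    response = await client.chat(
        model="llama3.2-vision",
        messages=[
            {"role": "user", "content": "Extract this receipt.", "images": [image_path]}
        ],
        format=ReceiptData.model_json_schema(),  # constrain output to the schema
        options={"temperature": 0},  # deterministic extraction
    )
    # The constrained reply arrives as a JSON string; parse and validate it
    return ReceiptData.model_validate_json(response["message"]["content"])


if __name__ == "__main__":
    print(asyncio.run(scan("receipt.jpg")))
```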
@@ -417,7 +436,7 @@ def gradio_interface(message: str, history: list) -> str:
# Create Gradio Interface with Both Text and Audio Inputs
# ------------------------------------------------------------------------------------

- with gr.Blocks(title="🎙️ Voice Command Communication Interface 🌍") as demo:
+ with gr.Blocks(title="🎙️ Voice & Vision Communication Interface 🌍") as demo:
    gr.Markdown("# Voice Command & Text Communication Interface")

    # Add tabs for voice and text input
@@ -527,6 +546,21 @@ def show_transcription(audio):
        type="messages",
    )

+     with gr.Tab("Receipt Scanner"):
+         image_input = gr.Image(type="filepath", label="Upload Receipt/Invoice")
+         scan_button = gr.Button("Scan Receipt")
+         result_text = gr.Textbox(label="Analysis Result")
+
+         scan_button.click(
+             fn=lambda img: asyncio.run(
+                 process_user_message(
+                     "Analyze this receipt", [], use_vision=True, image_path=img
+                 )
+             ),
+             inputs=image_input,
+             outputs=result_text,
+         )
+

if __name__ == "__main__":
    try:
        logger.info("Launching Gradio interface...")
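The click handler above wraps the coroutine in asyncio.run() inside a lambda. As an alternative sketch (reusing the names from this diff), Gradio also accepts async functions directly as event handlers, which avoids creating a fresh event loop on every click:

```python
# Sketch: Gradio supports async event handlers directly (assumes the
# process_user_message signature added in this diff).
async def scan_receipt(img: str) -> str:
    return await process_user_message(
        "Analyze this receipt", [], use_vision=True, image_path=img
    )


# scan_button.click(fn=scan_receipt, inputs=image_input, outputs=result_text)
```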