Skip to content

Commit a3edbb1

Browse files
committed
Add vision support for receipt analysis and enhance user messaging
1 parent e19ce18 commit a3edbb1

File tree

4 files changed: +90 −10 lines changed

app.py

+11
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
from langtrace_python_sdk import langtrace, with_langtrace_root_span
4141
import ollama
4242
from utils.function_call import send_airtime, send_message, search_news, translate_text
43+
from utils.constants import VISION_SYSTEM_PROMPT, API_SYSTEM_PROMPT
44+
from utils.models import ReceiptData, LineItem
4345

4446
# ------------------------------------------------------------------------------------
4547
# Logging Configuration
@@ -296,6 +298,9 @@ async def process_user_message(
296298

297299
messages = []
298300

301+
# Set the system prompt based on the vision flag
302+
system_prompt = VISION_SYSTEM_PROMPT if use_vision else API_SYSTEM_PROMPT
303+
299304
# Construct message based on vision flag
300305
if use_vision:
301306
messages.append(
@@ -306,6 +311,7 @@ async def process_user_message(
306311
}
307312
)
308313
else:
314+
messages.append({"role": "system", "content": system_prompt})
309315
messages.append({"role": "user", "content": message})
310316

311317
try:
@@ -316,6 +322,9 @@ async def process_user_message(
316322
model=model_name,
317323
messages=messages,
318324
tools=None if use_vision else tools, # Vision models don't use tools
325+
options={
326+
"temperature": 0
327+
}, # Set temperature to 0 for deterministic responses
319328
)
320329
except Exception as e:
321330
logger.exception("Failed to get response from Ollama client.")
@@ -442,6 +451,8 @@ def gradio_interface(message: str, history: list) -> str:
442451
"- `Send airtime to +254712345678 with an amount of 10 in currency KES` 📞\n"
443452
"- `Send a message to +254712345678 with the message 'Hello there', using the username 'username'` 💬\n"
444453
"- `Search news for 'latest technology trends'` 📰\n\n"
454+
"You can also translate text to a target language by typing:\n"
455+
"- `Translate the text 'Hi' to the target language 'French'` 🌐\n\n"
445456
"Please enter your command below to get started. 🚀"
446457
),
447458
examples=[

utils/constants.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
VISION_SYSTEM_PROMPT = """You are a precise receipt and invoice parsing assistant. Your tasks:
2+
- Extract merchant details, dates, amounts
3+
- Identify line items with quantities and prices
4+
- Detect payment methods and receipt numbers
5+
- Calculate totals and taxes
6+
- Extract all visible text
7+
Format response according to the provided schema."""
8+
9+
API_SYSTEM_PROMPT = """You are a communication API assistant specialized in executing specific commands:
10+
- Send airtime: Requires phone number and amount
11+
- Send messages: Requires recipient and content
12+
- Search news: Requires query
13+
- Translate text: Requires text and target language"""

utils/models.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from typing import List, Optional
2+
from pydantic import BaseModel
3+
from datetime import datetime
4+
5+
6+
class LineItem(BaseModel):
    """One line item extracted from a receipt or invoice image.

    The numeric fields are optional because vision extraction may not
    recover every value for every item.
    """

    # Free-text description of the purchased item.
    description: str
    # NOTE: Pydantic v2 (this project calls model_json_schema(), a v2 API)
    # does not treat Optional[...] as implying a default, so without the
    # explicit `= None` these fields would be *required* and validation
    # would fail whenever the model omits them.
    # Quantity purchased; None when not visible on the receipt.
    quantity: Optional[float] = None
    # Price per unit; None when not visible on the receipt.
    unit_price: Optional[float] = None
    # Line total; None when not visible on the receipt.
    total: Optional[float] = None
11+
12+
13+
class ReceiptData(BaseModel):
    """Structured result of parsing a receipt/invoice image.

    Its JSON schema is passed as the `format` argument of the vision chat
    request so the model returns data matching this shape. Optional fields
    carry an explicit `= None` default — under Pydantic v2, `Optional[...]`
    alone would still make them required, breaking validation whenever the
    model cannot extract a value.
    """

    # Merchant/store name as printed on the receipt.
    merchant_name: str
    # Transaction date/time; None when not extractable.
    date: Optional[datetime] = None
    # Grand total charged.
    total_amount: float
    # Currency of the amounts, as detected from the receipt text.
    currency: str
    # Individual purchased items.
    items: List[LineItem]
    # Tax portion of the total; None when not itemized.
    tax_amount: Optional[float] = None
    # Payment method if shown (presumably e.g. cash/card) — None otherwise.
    payment_method: Optional[str] = None
    # Receipt/invoice identifier; None when absent.
    receipt_number: Optional[str] = None
    # All raw text visible on the receipt.
    detected_text: str

voice_stt_mode.py

+44-10
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@
4848

4949
# Local Module Imports
5050
from utils.function_call import send_airtime, send_message, search_news, translate_text
51+
from typing import Optional
52+
from utils.models import ReceiptData, LineItem
53+
from utils.constants import VISION_SYSTEM_PROMPT, API_SYSTEM_PROMPT
5154

5255
# ------------------------------------------------------------------------------------
5356
# Logging Configuration
@@ -223,7 +226,12 @@
223226

224227

225228
@with_langtrace_root_span()
226-
async def process_user_message(message: str, history: list) -> str:
229+
async def process_user_message(
230+
message: str,
231+
history: list,
232+
use_vision: bool = False,
233+
image_path: Optional[str] = None,
234+
) -> str:
227235
"""
228236
Handle the conversation with the model asynchronously.
229237
@@ -233,6 +241,10 @@ async def process_user_message(message: str, history: list) -> str:
233241
The user's input message.
234242
history : list of list of str
235243
The conversation history up to that point.
244+
use_vision : bool, optional
245+
Whether to use vision model for processing (default is False).
246+
image_path : str, optional
247+
Path to the image file for vision model (default is None).
236248
237249
Returns
238250
-------
@@ -241,19 +253,26 @@ async def process_user_message(message: str, history: list) -> str:
241253
"""
242254
logger.info("Processing user message: %s", message)
243255
client = ollama.AsyncClient()
256+
messages = []
244257

245-
messages = [
246-
{
247-
"role": "user",
248-
"content": message,
249-
}
250-
]
258+
# Add system prompt
259+
system_prompt = VISION_SYSTEM_PROMPT if use_vision else API_SYSTEM_PROMPT
260+
messages.append({"role": "system", "content": system_prompt})
261+
262+
# Add user message with image if present
263+
if use_vision and image_path:
264+
messages.append({"role": "user", "content": message, "images": [image_path]})
265+
else:
266+
messages.append({"role": "user", "content": message})
251267

252268
try:
269+
model_name = "llama3.2-vision" if use_vision else "qwen2.5:0.5b"
253270
response = await client.chat(
254-
model="qwen2.5:0.5b",
271+
model=model_name,
255272
messages=messages,
256-
tools=tools,
273+
tools=None if use_vision else tools,
274+
format=ReceiptData.model_json_schema() if use_vision else None,
275+
options={"temperature": 0},
257276
)
258277
except Exception as e:
259278
logger.exception("Failed to get response from Ollama client.")
@@ -417,7 +436,7 @@ def gradio_interface(message: str, history: list) -> str:
417436
# Create Gradio Interface with Both Text and Audio Inputs
418437
# ------------------------------------------------------------------------------------
419438

420-
with gr.Blocks(title="🎙️ Voice Command Communication Interface 🌍") as demo:
439+
with gr.Blocks(title="🎙️ Voice & Vision Communication Interface 🌍") as demo:
421440
gr.Markdown("# Voice Command & Text Communication Interface")
422441

423442
# Add tabs for voice and text input
@@ -527,6 +546,21 @@ def show_transcription(audio):
527546
type="messages",
528547
)
529548

549+
with gr.Tab("Receipt Scanner"):
550+
image_input = gr.Image(type="filepath", label="Upload Receipt/Invoice")
551+
scan_button = gr.Button("Scan Receipt")
552+
result_text = gr.Textbox(label="Analysis Result")
553+
554+
scan_button.click(
555+
fn=lambda img: asyncio.run(
556+
process_user_message(
557+
"Analyze this receipt", [], use_vision=True, image_path=img
558+
)
559+
),
560+
inputs=image_input,
561+
outputs=result_text,
562+
)
563+
530564
if __name__ == "__main__":
531565
try:
532566
logger.info("Launching Gradio interface...")

0 commit comments

Comments
 (0)