Skip to content

Commit a3edbb1

Browse files
committed
Add vision support for receipt analysis and enhance user messaging
1 parent e19ce18 commit a3edbb1

File tree

4 files changed: +90 −10 lines changed

app.py

+11
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
from langtrace_python_sdk import langtrace, with_langtrace_root_span
4141
import ollama
4242
from utils.function_call import send_airtime, send_message, search_news, translate_text
43+
from utils.constants import VISION_SYSTEM_PROMPT, API_SYSTEM_PROMPT
44+
from utils.models import ReceiptData, LineItem
4345

4446
# ------------------------------------------------------------------------------------
4547
# Logging Configuration
@@ -296,6 +298,9 @@ async def process_user_message(
296298

297299
messages = []
298300

301+
# Set the system prompt based on the vision flag
302+
system_prompt = VISION_SYSTEM_PROMPT if use_vision else API_SYSTEM_PROMPT
303+
299304
# Construct message based on vision flag
300305
if use_vision:
301306
messages.append(
@@ -306,6 +311,7 @@ async def process_user_message(
306311
}
307312
)
308313
else:
314+
messages.append({"role": "system", "content": system_prompt})
309315
messages.append({"role": "user", "content": message})
310316

311317
try:
@@ -316,6 +322,9 @@ async def process_user_message(
316322
model=model_name,
317323
messages=messages,
318324
tools=None if use_vision else tools, # Vision models don't use tools
325+
options={
326+
"temperature": 0
327+
}, # Set temperature to 0 for deterministic responses
319328
)
320329
except Exception as e:
321330
logger.exception("Failed to get response from Ollama client.")
@@ -442,6 +451,8 @@ def gradio_interface(message: str, history: list) -> str:
442451
"- `Send airtime to +254712345678 with an amount of 10 in currency KES` 📞\n"
443452
"- `Send a message to +254712345678 with the message 'Hello there', using the username 'username'` 💬\n"
444453
"- `Search news for 'latest technology trends'` 📰\n\n"
454+
"You can also translate text to a target language by typing:\n"
455+
"- `Translate the text 'Hi' to the target language 'French'` 🌐\n\n"
445456
"Please enter your command below to get started. 🚀"
446457
),
447458
examples=[

utils/constants.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
VISION_SYSTEM_PROMPT = """You are a precise receipt and invoice parsing assistant. Your tasks:
2+
- Extract merchant details, dates, amounts
3+
- Identify line items with quantities and prices
4+
- Detect payment methods and receipt numbers
5+
- Calculate totals and taxes
6+
- Extract all visible text
7+
Format response according to the provided schema."""
8+
9+
API_SYSTEM_PROMPT = """You are a communication API assistant specialized in executing specific commands:
10+
- Send airtime: Requires phone number and amount
11+
- Send messages: Requires recipient and content
12+
- Search news: Requires query
13+
- Translate text: Requires text and target language"""

utils/models.py

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from typing import List, Optional
2+
from pydantic import BaseModel
3+
from datetime import datetime
4+
5+
6+
class LineItem(BaseModel):
    """One line item extracted from a receipt or invoice image.

    The numeric fields are optional because vision extraction may not
    recover every value for every item.
    """

    # Free-text description of the purchased item.
    description: str
    # NOTE: Pydantic v2 (this project calls model_json_schema(), a v2 API)
    # does not treat Optional[...] as implying a default, so without the
    # explicit `= None` these fields would be *required* and validation
    # would fail whenever the model omits them.
    # Quantity purchased; None when not visible on the receipt.
    quantity: Optional[float] = None
    # Price per unit; None when not visible on the receipt.
    unit_price: Optional[float] = None
    # Line total; None when not visible on the receipt.
    total: Optional[float] = None
11+
12+
13+
class ReceiptData(BaseModel):
    """Structured result of parsing a receipt/invoice image.

    Its JSON schema is passed as the `format` argument of the vision chat
    request so the model returns data matching this shape. Optional fields
    carry an explicit `= None` default — under Pydantic v2, `Optional[...]`
    alone would still make them required, breaking validation whenever the
    model cannot extract a value.
    """

    # Merchant/store name as printed on the receipt.
    merchant_name: str
    # Transaction date/time; None when not extractable.
    date: Optional[datetime] = None
    # Grand total charged.
    total_amount: float
    # Currency of the amounts, as detected from the receipt text.
    currency: str
    # Individual purchased items.
    items: List[LineItem]
    # Tax portion of the total; None when not itemized.
    tax_amount: Optional[float] = None
    # Payment method if shown (presumably e.g. cash/card) — None otherwise.
    payment_method: Optional[str] = None
    # Receipt/invoice identifier; None when absent.
    receipt_number: Optional[str] = None
    # All raw text visible on the receipt.
    detected_text: str

voice_stt_mode.py

+44-10
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@
4848

4949
# Local Module Imports
5050
from utils.function_call import send_airtime, send_message, search_news, translate_text
51+
from typing import Optional
52+
from utils.models import ReceiptData, LineItem
53+
from utils.constants import VISION_SYSTEM_PROMPT, API_SYSTEM_PROMPT
5154

5255
# ------------------------------------------------------------------------------------
5356
# Logging Configuration
@@ -223,7 +226,12 @@
223226

224227

225228
@with_langtrace_root_span()
226-
async def process_user_message(message: str, history: list) -> str:
229+
async def process_user_message(
230+
message: str,
231+
history: list,
232+
use_vision: bool = False,
233+
image_path: Optional[str] = None,
234+
) -> str:
227235
"""
228236
Handle the conversation with the model asynchronously.
229237
@@ -233,6 +241,10 @@ async def process_user_message(message: str, history: list) -> str:
233241
The user's input message.
234242
history : list of list of str
235243
The conversation history up to that point.
244+
use_vision : bool, optional
245+
Whether to use vision model for processing (default is False).
246+
image_path : str, optional
247+
Path to the image file for vision model (default is None).
236248
237249
Returns
238250
-------
@@ -241,19 +253,26 @@ async def process_user_message(message: str, history: list) -> str:
241253
"""
242254
logger.info("Processing user message: %s", message)
243255
client = ollama.AsyncClient()
256+
messages = []
244257

245-
messages = [
246-
{
247-
"role": "user",
248-
"content": message,
249-
}
250-
]
258+
# Add system prompt
259+
system_prompt = VISION_SYSTEM_PROMPT if use_vision else API_SYSTEM_PROMPT
260+
messages.append({"role": "system", "content": system_prompt})
261+
262+
# Add user message with image if present
263+
if use_vision and image_path:
264+
messages.append({"role": "user", "content": message, "images": [image_path]})
265+
else:
266+
messages.append({"role": "user", "content": message})
251267

252268
try:
269+
model_name = "llama3.2-vision" if use_vision else "qwen2.5:0.5b"
253270
response = await client.chat(
254-
model="qwen2.5:0.5b",
271+
model=model_name,
255272
messages=messages,
256-
tools=tools,
273+
tools=None if use_vision else tools,
274+
format=ReceiptData.model_json_schema() if use_vision else None,
275+
options={"temperature": 0},
257276
)
258277
except Exception as e:
259278
logger.exception("Failed to get response from Ollama client.")
@@ -417,7 +436,7 @@ def gradio_interface(message: str, history: list) -> str:
417436
# Create Gradio Interface with Both Text and Audio Inputs
418437
# ------------------------------------------------------------------------------------
419438

420-
with gr.Blocks(title="🎙️ Voice Command Communication Interface 🌍") as demo:
439+
with gr.Blocks(title="🎙️ Voice & Vision Communication Interface 🌍") as demo:
421440
gr.Markdown("# Voice Command & Text Communication Interface")
422441

423442
# Add tabs for voice and text input
@@ -527,6 +546,21 @@ def show_transcription(audio):
527546
type="messages",
528547
)
529548

549+
with gr.Tab("Receipt Scanner"):
550+
image_input = gr.Image(type="filepath", label="Upload Receipt/Invoice")
551+
scan_button = gr.Button("Scan Receipt")
552+
result_text = gr.Textbox(label="Analysis Result")
553+
554+
scan_button.click(
555+
fn=lambda img: asyncio.run(
556+
process_user_message(
557+
"Analyze this receipt", [], use_vision=True, image_path=img
558+
)
559+
),
560+
inputs=image_input,
561+
outputs=result_text,
562+
)
563+
530564
if __name__ == "__main__":
531565
try:
532566
logger.info("Launching Gradio interface...")

0 commit comments

Comments
 (0)