diff --git a/Makefile b/Makefile
index 9e1eedc..d4be705 100644
--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,7 @@ venv/bin/activate: requirements.txt #requirements.txt is a requirement, otherwis
 	# make command executable
 	# chmod is a shell command; +x adds execute permission for the user classes allowed by umask
 	# a+x grants execute permission to all users (owner, group, and others)
-	chmod +x .venv/bin/activate
+	chmod +x .venv/bin/activate
 
 	# activate virtual environment
 	. .venv/bin/activate
@@ -37,19 +37,19 @@ install: venv/bin/activate requirements.txt # prerequisite
 	# docstring: activate
 	# format docstring, might have to change this as well
-	# write a template using a numpydoc convention and output it to my python file
+	# write a template using the numpydoc convention and output it to my python file
 	# so basically just document functions, classes etc. in the numpy style
 	pyment -w -o numpydoc *.py
 
-format: activate
+format: activate
 	# format code
-	black utils/*.py tests/*.py
+	black *.py utils/*.py tests/*.py
 
 clean:
 	# clean directory of cache
 	# files like __pycache__ are generated after running py files
-	# the data speeds up execution of py files in subsequent runs
-	# reduces size of repo
+	# the data speeds up execution of py files in subsequent runs
+	# removing it reduces the size of the repo
 	# during version control, removing them avoids conflicts with other devs' cached files
 	# add code to remove ipynb checkpoints
 	# the &&\ is used to say: after running this successfully, run the next...
@@ -63,12 +63,12 @@ clean:
 	rm -rf *.log
 	rm -rf tests/__pycache__
 
-lint: activate install
+lint: activate install
 	#flake8 or #pylint
 	# In this scenario it'll only tell us about errors found in your code
-	# R - refactor
+	# R - refactor
 	# C - convention
-	pylint --disable=R,C --errors-only *.py
+	pylint --disable=R,C --errors-only *.py
 
 test: activate install
 	# run tests
@@ -87,6 +87,10 @@ run_gradio: activate install format
 	# run gradio
 	$(PYTHON) app.py
 
+run_gradio_stt: activate install format
+	# run the voice (speech-to-text) gradio app
+	$(PYTHON) voice_stt_mode.py
+
 docker_build: Dockerfile
 	#build container
 	# docker build -t $(DOCKER_IMAGE_TAG) .
@@ -95,7 +99,7 @@ docker_run_test: Dockerfile.app Dockerfile.ollama
 	# linting Dockerfile
 	docker run --rm -i hadolint/hadolint < Dockerfile.ollama
 	docker run --rm -i hadolint/hadolint < Dockerfile.app
-
+
 docker_clean: Dockerfile.ollama Dockerfile.app
 	# clean docker
@@ -109,7 +113,7 @@ docker_run: Dockerfile.ollama Dockerfile.app
 	# run docker
 	# this is basically a test to see if a docker image is being created successfully
 	docker-compose up --build
-
+
 setup_readme: ## Create a README.md
 	@if [ ! -f README.md ]; then \
 		echo "# Project Name\n\
diff --git a/README.md b/README.md
index 0cc32f1..cacdac9 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,11 @@ Function-calling with Python and ollama. We are going to use the Africa's Talkin
 NB: The phone numbers are placeholders for the actual phone numbers. You need some VRAM to run this project. You can get VRAM from [here](https://vast.ai/)
-We recommend 400MB-8GB of VRAM for this project. It can run on CPU however, I recommend smaller models for this.
+We recommend 400MB-8GB of VRAM for this project. It can also run on a CPU; in that case, we recommend the smaller models.
 
-[Mistral 7B](https://ollama.com/library/mistral), **llama 3.2 3B/1B**, [**Qwen 2.5: 0.5/1.5B**](https://ollama.com/library/qwen2.5:1.5b), [nemotron-mini 4b](https://ollama.com/library/nemotron-mini) and [llama3.1 8B](https://ollama.com/library/llama3.1) are the recommended models for this project.
+[Mistral 7B](https://ollama.com/library/mistral), **llama 3.2 3B/1B**, [**Qwen 2.5: 0.5/1.5B**](https://ollama.com/library/qwen2.5:1.5b), [nemotron-mini 4b](https://ollama.com/library/nemotron-mini) and [llama3.1 8B](https://ollama.com/library/llama3.1) are the recommended models for this project.
 
-Ensure ollama is installed on your laptop/server and running before running this project. You can install ollama from [here](ollama.com)
+Ensure ollama is installed on your laptop/server and running before starting this project. You can install ollama from [here](https://ollama.com)
 
 Learn more about tool calling
 
@@ -22,41 +22,41 @@ Learn more about tool calling
 - [Usage](#usage)
 - [Use cases](#use-cases)
 - [Responsible AI Practices](#responsible-ai-practices)
-- [Limitations](#limitations)
+- [Limitations](#limitations)
 - [Contributing](#contributing)
-- [License](#license)
+- [License](#license)
 
 ## File structure
-.
-├── Dockerfile.app - template to run the gradio dashboard.
-├── Dockerfile.ollama - template to run the ollama server.
-├── docker-compose.yml - use the ollama project and gradio dashboard.
-├── docker-compose-codecarbon.yml - use the codecarbon project, ollama and gradio dashboard.
-├── .env - This file contains the environment variables for the project. (Not included in the repository)
-├── app.py - the function_call.py using gradio as the User Interface.
-├── Makefile - This file contains the commands to run the project.
-├── README.md - This file contains the project documentation. This is the file you are currently reading.
-├── requirements.txt - This file contains the dependencies for the project.
-├── summary.png - How function calling works with a diagram.
-├── tests - This directory contains the test files for the project.
-│   ├── __init__.py - This file initializes the tests directory as a package.
-│   ├── test_cases.py - This file contains the test cases for the project.
-│   └── test_run.py - This file contains the code to run the test cases for the function calling LLM.
-└── utils - This directory contains the utility files for the project.
+.
+├── Dockerfile.app - template to run the gradio dashboard.
+├── Dockerfile.ollama - template to run the ollama server.
+├── docker-compose.yml - use the ollama project and gradio dashboard.
+├── docker-compose-codecarbon.yml - use the codecarbon project, ollama and gradio dashboard.
+├── .env - This file contains the environment variables for the project. (Not included in the repository)
+├── app.py - runs the function-calling workflow behind a Gradio user interface.
+├── Makefile - This file contains the commands to run the project.
+├── README.md - This file contains the project documentation. This is the file you are currently reading.
+├── requirements.txt - This file contains the dependencies for the project.
+├── summary.png - How function calling works with a diagram.
+├── tests - This directory contains the test files for the project.
+│   ├── __init__.py - This file initializes the tests directory as a package.
+│   ├── test_cases.py - This file contains the test cases for the project.
+│   └── test_run.py - This file contains the code to run the test cases for the function calling LLM.
+└── utils - This directory contains the utility files for the project.
+    ├── __init__.py - This file initializes the utils directory as a package.
+    ├── function_call.py - This file contains the code to call a function using LLMs.
+    └── communication_apis.py - This file contains the code to do with communication apis & experiments.
 
 ### attribution
-This project uses the Qwen2.5-0.5B model developed by Alibaba Cloud under the Apache License 2.0. The original project can be found at [Qwen technical report](https://arxiv.org/abs/2412.15115)
+This project uses the Qwen2.5-0.5B model developed by Alibaba Cloud under the Apache License 2.0. The original project can be found at the [Qwen technical report](https://arxiv.org/abs/2412.15115).
 
 ### License
 This project is licensed under the Apache License 2.0. See the [LICENSE](./LICENSE) file for more details.
-
+
 ## Installation
-The project uses python 3.12. To install the project, follow the steps below:
+The project uses Python 3.12. To install the project, follow the steps below:
 
 - Clone the repository
 ```bash
@@ -65,7 +65,7 @@ git clone https://github.com/Shuyib/tool_calling_api.git
 ```
 - Change directory to the project directory
 ```bash
 cd tool_calling_api
-```
+```
 - Create a virtual environment
 ```bash
 python3 -m venv .venv
@@ -88,7 +88,7 @@ make install
 ```bash
 make run
 ```
-Long way to run the project
+The long way to run the project:
 
 - Change directory to the utils directory
 ```bash
@@ -121,82 +121,89 @@ make docker_run
 ```
 Notes:
-- The .env file contains the environment variables for the project. You can create a .env file and add the following environment variables:
+- The .env file contains the environment variables for the project. Create a .env file and add the following environment variables:
 ```bash
 echo "AT_API_KEY = yourapikey" >> .env
 echo "AT_USERNAME = yourusername" >> .env
-echo "LANGTRACE_API_KEY= yourlangtraceapikey" >> .env
+echo "GROQ_API_KEY = yourgroqapikey" >> .env
+echo "LANGTRACE_API_KEY = yourlangtraceapikey" >> .env
 echo "TEST_PHONE_NUMBER = yourphonenumber" >> .env
 echo "TEST_PHONE_NUMBER_2 = yourphonenumber" >> .env
 echo "TEST_PHONE_NUMBER_3 = yourphonenumber" >> .env
 ```
-- The Dockerfile creates 2 images for the ollama server and the gradio dashboard. The ollama server is running on port 11434 and the gradio dashboard is running on port 7860 . You can access the gradio dashboard by visiting in your browser & the ollama server by visiting in your browser. They consume about 2.72GB of storage in the container.
+- The Dockerfile creates 2 images: one for the ollama server and one for the gradio dashboard. The ollama server runs on port 11434 and the gradio dashboard on port 7860. You can access the gradio dashboard at http://localhost:7860 and the ollama server at http://localhost:11434 in your browser. Together they consume about 2.72GB of storage in the container.
 - The docker-compose.yml file is used to run the ollama server and the gradio dashboard. The docker-compose-codecarbon.yml file is used to run the ollama server, the gradio dashboard and the codecarbon project.
-- You can learn more about how to make this system even more secure. Do this [course](https://www.kaggle.com/learn-guide/5-day-genai#GenAI).
+- You can learn more about how to make this system even more secure in this [course](https://www.kaggle.com/learn-guide/5-day-genai#GenAI).
 
 ## Run in runpod.io
-Make an account if you haven't already. Once that's settled.
+Make an account if you haven't already. Once that's settled:
 
-- Click on Deploy under Pods.
-- Select the cheapest option pod to deploy for example RTX 2000 Ada.
-- This will create a jupyter lab instance.
-- Follow the Installation steps in the terminal available. Until the make install.
-- Run this command. Install ollama and serve it then redirect output to a log file.
+- Click on Deploy under Pods.
+- Select the cheapest pod to deploy, for example an RTX 2000 Ada.
+- This will create a Jupyter Lab instance.
+- Follow the Installation steps above in the terminal provided, up to `make install`.
+- Run this command to install ollama, serve it, and redirect its output to a log file:
 ```bash
 curl -fsSL https://ollama.com/install.sh | sh && ollama serve > ollama.log 2>&1 &
 ```
-- Install your preferred model in the same terminal.
+- Install your preferred model in the same terminal.
 ```bash
 ollama run qwen2.5:0.5b
 ```
-- Export your credentials but, if you are using a .env file, you can skip this step. It will be useful for Docker.
+- Export your credentials; if you are using a .env file, you can skip this step. It will be useful for Docker.
 ```bash
 export AT_API_KEY=yourapikey
 export AT_USERNAME=yourusername
+export GROQ_API_KEY=yourgroqapikey
 export LANGTRACE_API_KEY=yourlangtraceapikey
 export TEST_PHONE_NUMBER=yourphonenumber
 export TEST_PHONE_NUMBER_2=yourphonenumber
 export TEST_PHONE_NUMBER_3=yourphonenumber
 ```
-- Continue running the installation steps in the terminal.
-- Send your first message and airtime with an LLM. 🌠
+- Continue with the remaining installation steps in the terminal.
+- Send your first message and airtime with an LLM. 🌠
 
-Read more about setting up ollama and serveless options &
+Read more about setting up ollama and serverless options &
 
 ## Usage
-This project uses LLMs to send airtime to a phone number. The difference is that we are going to use the Africa's Talking API to send airtime to a phone number using Natural language. Here are examples of prompts you can use to send airtime to a phone number:
-- Send airtime to xxxxxxxxxx046 and xxxxxxxxxx524 with an amount of 10 in currency KES.
+This project uses LLMs to send airtime to a phone number. What sets it apart is that the Africa's Talking API is driven by natural language. Here are examples of prompts you can use:
+- Send airtime to xxxxxxxxxx046 and xxxxxxxxxx524 with an amount of 10 in currency KES.
 - Send a message to xxxxxxxxxx046 and xxxxxxxxxx524 with a message "Hello, how are you?", using the username "username".
 
+## Updated Usage Instructions
+- The app now supports both Text and Voice input tabs.
+- In the Voice Input tab, record audio and click "Transcribe" to preview the transcription. Then click "Process Edited Text" to execute voice commands.
+- In the Text Input tab, directly type commands to send airtime or messages or to search news.
+
 ### Responsible AI Practices
-This project implements several responsible AI practices:
-- All test data is anonymized to protect privacy.
-- Input validation to prevent misuse (negative amounts, spam detection).
-- Handling of sensitive content and edge cases.
-- Comprehensive test coverage for various scenarios.
-- Secure handling of credentials and personal information.
+This project implements several responsible AI practices:
+- All test data is anonymized to protect privacy.
+- Input validation to prevent misuse (negative amounts, spam detection); a minimal sketch of this idea follows below.
+- Handling of sensitive content and edge cases.
+- Comprehensive test coverage for various scenarios.
+- Secure handling of credentials and personal information.
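+
+The sketch below illustrates the input-validation idea; the names (such as
+`validate_airtime_request`) are hypothetical and are not the project's actual code:
+
+```python
+import re
+
+
+def validate_airtime_request(phone_number: str, amount: str) -> str | None:
+    """Return an error message, or None if the request looks valid."""
+    # International format, e.g. +254712345678
+    if not re.fullmatch(r"\+\d{10,14}", phone_number):
+        return "Phone number must be in international format, e.g. +254712345678"
+    try:
+        value = float(amount)
+    except ValueError:
+        return "Amount must be numeric"
+    if value <= 0:
+        return "Amount must be positive"  # rejects the negative-amount misuse case
+    return None
+```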
 ![Process Summary](summary.png)
 
 ## Use cases
-   * Non-Technical User Interfaces: Simplifies the process for non-coders to interact with APIs, making it easier for them to send airtime and messages without needing to understand the underlying code.
-   * Customer Support Automation: Enables customer support teams to quickly send airtime or messages to clients using natural language commands, improving efficiency and response times.
-   * Marketing Campaigns: Facilitates the automation of promotional messages and airtime rewards to customers, enhancing engagement and retention.
-   * Emergency Notifications: Allows rapid dissemination of urgent alerts and notifications to a large number of recipients using simple prompts.
-   * Educational Tools: Provides a practical example for teaching how to integrate APIs with natural language processing, which can be beneficial for coding bootcamps and workshops.
-   * Multilingual Support: Supports multiple languages when sending messages and airtime, making it accessible to a diverse range of users. Testing for Arabic, French, English and Portuguese.
+   * Non-Technical User Interfaces: Simplifies the process for non-coders to interact with APIs, making it easier for them to send airtime and messages without needing to understand the underlying code.
+   * Customer Support Automation: Enables customer support teams to quickly send airtime or messages to clients using natural language commands, improving efficiency and response times.
+   * Marketing Campaigns: Facilitates the automation of promotional messages and airtime rewards to customers, enhancing engagement and retention.
+   * Emergency Notifications: Allows rapid dissemination of urgent alerts and notifications to a large number of recipients using simple prompts.
+   * Educational Tools: Provides a practical example for teaching how to integrate APIs with natural language processing, which can be beneficial for coding bootcamps and workshops.
+   * Multilingual Support: Supports multiple languages when sending messages and airtime, making it accessible to a diverse range of users. Tested with Arabic, French, English and Portuguese.
 
 ## Limitations
-- The project is limited to sending airtime, searching for news, and messages using the Africa's Talking API. The functionality can be expanded to include other APIs and services.
+- The project is limited to sending airtime, sending messages, and searching for news using the Africa's Talking API. The functionality can be expanded to include other APIs and services.
 
-- The jailbreaking of the LLMS is a limitation. The LLMS are not perfect and can be manipulated to produce harmful outputs. This can be mitigated by using a secure environment and monitoring the outputs for any malicious content. However, the Best of N technique and prefix injection were effective in changing model behavior.
+- Jailbreaking of the LLMs is a limitation. The LLMs are not perfect and can be manipulated to produce harmful outputs. This can be mitigated by using a secure environment and monitoring the outputs for malicious content. In our testing, the Best-of-N technique and prefix injection were effective in changing model behavior.
 
-- A small number of test cases were used to test the project. More test cases can be added to cover a wider range of scenarios and edge cases.
+- A small number of test cases were used to test the project. More test cases can be added to cover a wider range of scenarios and edge cases.
 
 ## Contributing
 Contributions are welcome.
 If you would like to contribute to the project, you can fork the repository, create a new branch, make your changes, and then open a pull request.
diff --git a/app.py b/app.py
index cf6bfdc..b7c9aa0 100644
--- a/app.py
+++ b/app.py
@@ -6,58 +6,163 @@ using the CodeCarbon library.
 
 Usage:
-    1. Set the environment variables `AT_USERNAME` and `AT_API_KEY` with your Africa's Talking credentials.
+    1. Set the environment variables `AT_USERNAME` and `AT_API_KEY` with your
+       Africa's Talking credentials.
     2. Run the script: `python app.py`
-    3. Access the Gradio web interface to send airtime or messages or search for news articles.
+    3. Access the Gradio web interface to send airtime or messages or search for
+       news articles.
 
 Example:
     Send airtime to a phone number:
     - `Send airtime to +254712345678 with an amount of 10 in currency KES`
     Send a message to a phone number:
-    - `Send a message to +254712345678 with the message 'Hello there', using the username 'username'`
+    - `Send a message to +254712345678 with the message 'Hello there',
+      using the username 'username'`
     Search for news about a topic:
     - `Latest news on climate change`
 """
 
+# ------------------------------------------------------------------------------------
+# Import Statements
+# ------------------------------------------------------------------------------------
+
+# Standard Library Imports
 import os
 import json
 import logging
+from logging.handlers import RotatingFileHandler
 import asyncio
+from importlib.metadata import version, PackageNotFoundError
+
+# Third-Party Library Imports
 import gradio as gr
 from langtrace_python_sdk import langtrace, with_langtrace_root_span
 import ollama
 
 from utils.function_call import send_airtime, send_message, search_news
 
-# langtrace init
-langtrace.init(api_key=os.getenv("LANGTRACE_API_KEY"))
+# ------------------------------------------------------------------------------------
+# Logging Configuration
+# ------------------------------------------------------------------------------------
+
+
+def setup_logger():
+    """Set up the logger with rotating file and stream handlers.
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    logger : logging.Logger
+        The logger object with the configured handlers.
+    """
+
+    # Create a logger
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)  # Capture all levels DEBUG and above
 
-# Set up the logger
-logger = logging.getLogger(__name__)
+    # Prevent logs from being propagated to the root logger to avoid duplication
+    logger.propagate = False
 
-# Set up logging
-logging.basicConfig(level=logging.DEBUG)
+    # Define logging format
+    formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s")
+
+    # Set up the Rotating File Handler
+    # the log file will be rotated when it reaches 5MB and will keep the last 5 logs
+    file_handler = RotatingFileHandler(
+        "func_calling_app.log", maxBytes=5 * 1024 * 1024, backupCount=5
+    )
+    file_handler.setLevel(logging.INFO)  # Capture INFO and above in the file
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    # Set up the Stream Handler for console output
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.DEBUG)  # Capture DEBUG and above in the console
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    return logger
 
-# Logging format
-formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s")
 
-# Set up the file handler & stream handler
-file_handler = logging.FileHandler("func_calling_app.log")
-file_handler.setFormatter(formatter)
-stream_handler = logging.StreamHandler()
-stream_handler.setFormatter(formatter)
+# Initialize logger
+logger = setup_logger()
 
-# Add the handlers to the logger
-logger.addHandler(file_handler)
-logger.addHandler(stream_handler)
+# Initialize Langtrace
+langtrace.init(api_key=os.getenv("LANGTRACE_API_KEY"))
+
+# ------------------------------------------------------------------------------------
+# Log the Start of the Script
+# ------------------------------------------------------------------------------------
 
-# Log the start of the script
 logger.info(
-    "Starting the function calling script to send airtime and messages using the "
-    "Africa's Talking API"
+    "Starting the function calling script to send airtime and messages using "
+    "Africa's Talking API"
 )
 logger.info("Let's review the packages and their versions")
 
-# Define tools schema
+# ------------------------------------------------------------------------------------
+# Log Versions of the Libraries
+# ------------------------------------------------------------------------------------
+
+pkgs = ["langtrace-python-sdk", "gradio", "ollama", "codecarbon"]
+
+for pkg in pkgs:
+    try:
+        pkg_version = version(pkg)
+        logger.info("%s version: %s", pkg, pkg_version)
+    except PackageNotFoundError:
+        logger.error("Package %s is not installed.", pkg)
+    except Exception as e:
+        logger.error("Failed to retrieve version for %s: %s", pkg, str(e))
+
+# ------------------------------------------------------------------------------------
+# Define Masking Functions
+# ------------------------------------------------------------------------------------
+
+
+def mask_phone_number(phone_number):
+    """Hide the first digits of a phone number. Only the last 4 digits will be visible.
+
+    Parameters
+    ----------
+    phone_number : str
+        The phone number to mask.
+
+    Returns
+    -------
+    str
+        The masked phone number.
+    """
+    if len(phone_number) < 4:
+        return "****"
+    return "x" * (len(phone_number) - 4) + phone_number[-4:]
+
+
+def mask_api_key(api_key):
+    """Hide the first characters of an API key. Only the last 4 characters will be visible.
+
+    Parameters
+    ----------
+    api_key : str
+        The API key to mask.
+
+    Returns
+    -------
+    str
+        The masked API key.
+    """
+    if len(api_key) < 4:
+        return "****"
+    return "x" * (len(api_key) - 4) + api_key[-4:]
+
+
+# ------------------------------------------------------------------------------------
+# Define Tools Schema
+# ------------------------------------------------------------------------------------
+
 tools = [
     {
         "type": "function",
@@ -133,6 +238,10 @@
     },
 ]
 
+# ------------------------------------------------------------------------------------
+# Define Function to Process User Queries
+# ------------------------------------------------------------------------------------
+
 
 @with_langtrace_root_span()
 async def process_user_message(message: str, history: list) -> str:
@@ -151,7 +260,10 @@ async def process_user_message(message: str, history: list) -> str:
     str
         The model's response or the function execution result.
     """
-    logger.info("Processing user message: %s", message)
+    masked_message = mask_phone_number(
+        message
+    )  # The whole message is masked, since it may contain a phone number
+    logger.info("Processing user message: %s", masked_message)
 
     client = ollama.AsyncClient()
 
     messages = [
         {
             "role": "user",
             "content": message,
         }
     ]
 
-    response = await client.chat(
-        model="qwen2.5:0.5b",
-        messages=messages,
-        tools=tools,
-    )
+    try:
+        response = await client.chat(
+            model="qwen2.5:0.5b", messages=messages, tools=tools
+        )
+    except Exception as e:
+        logger.exception("Failed to get response from Ollama client.")
+        return "An unexpected error occurred while communicating with the assistant."
 
     model_message = response.get("message", {})
     model_content = model_message.get("content", "")
@@ -178,47 +292,72 @@
             "content": model_content,
         }
     )
-    logger.debug("Model response: %s", response["message"])
+    logger.debug("Model messages: %s", messages)
 
     if model_message.get("tool_calls"):
         for tool in model_message["tool_calls"]:
             tool_name = tool["function"]["name"]
             arguments = tool["function"]["arguments"]
-            logger.info("Tool call detected: %s", tool_name)
-
-            if tool_name == "send_airtime":
-                logger.info("Calling send_airtime with arguments: %s", arguments)
-                function_response = send_airtime(
-                    arguments["phone_number"],
-                    arguments["currency_code"],
-                    arguments["amount"],
-                )
-            elif tool_name == "send_message":
-                logger.info("Calling send_message with arguments: %s", arguments)
-                function_response = send_message(
-                    arguments["phone_number"],
-                    arguments["message"],
-                    arguments["username"],
-                )
-            elif tool_name == "search_news":
-                logger.info("Calling search_news with arguments: %s", arguments)
-                function_response = search_news(arguments["query"])
-            else:
-                function_response = json.dumps({"error": "Unknown function"})
-
-            logger.debug("Function response: %s", function_response)
-            messages.append(
-                {
-                    "role": "tool",
-                    "content": function_response,
-                }
+
+            # Mask sensitive arguments before logging
+            masked_args = {}
+            for key, value in arguments.items():
+                if "phone_number" in key:
+                    masked_args[key] = mask_phone_number(value)
+                elif "api_key" in key:
+                    masked_args[key] = mask_api_key(value)
+                else:
+                    masked_args[key] = value
+
+            # Log the tool name together with its masked arguments
+            logger.info(
+                "Tool call detected: %s with arguments: %s", tool_name, str(masked_args)
             )
-            return f"Function `{tool_name}` executed successfully. Response:\n{function_response}"
 
+            try:
+                if tool_name == "send_airtime":
+                    logger.info("Calling send_airtime with arguments: %s", masked_args)
+                    function_response = send_airtime(
+                        arguments["phone_number"],
+                        arguments["currency_code"],
+                        arguments["amount"],
+                    )
+                elif tool_name == "send_message":
+                    logger.info("Calling send_message with arguments: %s", masked_args)
+                    function_response = send_message(
+                        arguments["phone_number"],
+                        arguments["message"],
+                        arguments["username"],
+                    )
+                elif tool_name == "search_news":
+                    logger.info("Calling search_news with arguments: %s", masked_args)
+                    function_response = search_news(arguments["query"])
+                else:
+                    function_response = json.dumps({"error": "Unknown function"})
+                    logger.warning("Unknown function: %s", tool_name)
+
+                logger.debug("Function response: %s", function_response)
+                messages.append(
+                    {
+                        "role": "tool",
+                        "content": function_response,
+                    }
+                )
+
+                return f"Function `{tool_name}` executed successfully. Response:\n{function_response}"  # noqa: E501
+            except Exception as e:
+                logger.exception("Error calling function %s: %s", tool_name, e)
+                return "An unexpected error occurred while processing your message."
     else:
+        logger.debug("No tool calls detected. Returning model content.")
         return model_content
 
 
+# ------------------------------------------------------------------------------------
+# Set Up Gradio Interface
+# ------------------------------------------------------------------------------------
+
+
 def gradio_interface(message: str, history: list) -> str:
     """
     Gradio interface function to process user messages and track emissions.
@@ -235,11 +374,18 @@ def gradio_interface(message: str, history: list) -> str:
     str
         The model's response or the function execution result.
     """
-    response = asyncio.run(process_user_message(message, history))
-    return response
+    try:
+        response = asyncio.run(process_user_message(message, history))
+        return response
+    except Exception as e:
+        logger.exception("Error processing user message: %s", e)
+        return "An unexpected error occurred while processing your message."
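+
+# NOTE: the if/elif dispatch in process_user_message could be table-driven, which
+# keeps adding a new tool to a one-line change. Illustrative sketch only (not wired
+# in; it assumes the same handlers imported at the top of this file):
+#
+#     TOOL_HANDLERS = {
+#         "send_airtime": lambda args: send_airtime(
+#             args["phone_number"], args["currency_code"], args["amount"]
+#         ),
+#         "send_message": lambda args: send_message(
+#             args["phone_number"], args["message"], args["username"]
+#         ),
+#         "search_news": lambda args: search_news(args["query"]),
+#     }
+#
+#     handler = TOOL_HANDLERS.get(tool_name)
+#     function_response = (
+#         handler(arguments) if handler else json.dumps({"error": "Unknown function"})
+#     )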
-# Create Gradio interface
+# ------------------------------------------------------------------------------------
+# Create Gradio Interface
+# ------------------------------------------------------------------------------------
+
 iface = gr.ChatInterface(
     fn=gradio_interface,
     title="📱 Multi-Service Communication Interface 🌍",
@@ -261,8 +407,21 @@ def gradio_interface(message: str, history: list) -> str:
     type="messages",
 )
 
-# Launch the Gradio interface
-iface.launch(inbrowser=True, server_name="0.0.0.0", server_port=7860)
+# ------------------------------------------------------------------------------------
+# Run the Gradio Interface
+# ------------------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    try:
+        logger.info("Launching Gradio interface...")
+        iface.launch(inbrowser=True, server_name="0.0.0.0", server_port=7860)
+        logger.info("Gradio interface launched successfully.")
+    except Exception as e:
+        logger.exception("Error launching Gradio interface: %s", e)
+
+    # Log the end of the script
+    logger.info("Script execution completed.")
 
-# Log the end of the script
-logger.info("Script execution completed")
+    # Flush logs to ensure all logs are written out
+    for handler in logger.handlers:
+        handler.flush()
diff --git a/requirements.txt b/requirements.txt
index 4a21e30..3684df7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,7 @@ langtrace-python-sdk==3.3.14
 setuptools==75.6.0
 pytest==8.3.4
 pytest-asyncio==0.25.0
-nltk==3.9.1
\ No newline at end of file
+nltk==3.9.1
+soundfile==0.12.1
+groq==0.13.1
+numpy==2.2.1
\ No newline at end of file
diff --git a/voice_stt_mode.py b/voice_stt_mode.py
new file mode 100644
index 0000000..d71abdf
--- /dev/null
+++ b/voice_stt_mode.py
@@ -0,0 +1,509 @@
+"""
+Airtime and Messaging Service using Africa's Talking API
+
+This script provides a Gradio-based web interface for sending airtime and messages
+using the Africa's Talking API. It also tracks the carbon emissions of the operations
+using the CodeCarbon library.
+
+The voice command interface lets users send airtime, send messages, and search for
+news articles using voice commands. Audio is transcribed to text first, because the
+model only accepts text input.
+
+Usage:
+    1. Set the environment variables `AT_USERNAME`, `GROQ_API_KEY`, and `AT_API_KEY`
+       with your Africa's Talking and Groq credentials.
+    2. Run the script: `python voice_stt_mode.py`
+    3. Access the Gradio web interface to send airtime or messages or search for news articles.
+
+Example:
+    Send airtime to a phone number:
+    - `Send airtime to +254712345678 with an amount of 10 in currency KES`
+    Send a message to a phone number:
+    - `Send a message to +254712345678 with the message 'Hello there',
+      using the username 'username'`
+    Search for news about a topic:
+    - `Latest news on climate change`
+"""
+
+# ------------------------------------------------------------------------------------
+# Import Statements
+# ------------------------------------------------------------------------------------
+
+# Standard Library Imports
+import os
+import io
+import json
+import logging
+from logging.handlers import RotatingFileHandler
+import asyncio
+from importlib.metadata import version, PackageNotFoundError
+
+# Third-Party Library Imports
+import gradio as gr
+from langtrace_python_sdk import langtrace, with_langtrace_root_span
+import groq
+import numpy as np
+import soundfile as sf
+import ollama
+
+# Local Module Imports
+from utils.function_call import send_airtime, send_message, search_news
+
+# ------------------------------------------------------------------------------------
+# Client and Logging Configuration
+# ------------------------------------------------------------------------------------
+
+# Initialize Langtrace and the Groq client
+langtrace.init(api_key=os.getenv("LANGTRACE_API_KEY"))
+groq_client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
+
+# Set up the logger
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)  # Set the logger to handle all levels DEBUG and above
+
+# Prevent logs from being propagated to the root logger
+logger.propagate = False
+
+# Define logging format
+formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s")
+
+# Set up the file handler for logging to a file
+file_handler = RotatingFileHandler(
+    "voice_stt_mode.log", maxBytes=5 * 1024 * 1024, backupCount=5
+)
+file_handler.setLevel(logging.INFO)  # Capture INFO and above in the file
+file_handler.setFormatter(formatter)
+logger.addHandler(file_handler)
+
+# Set up the stream handler for console output
+stream_handler = logging.StreamHandler()
+stream_handler.setLevel(logging.DEBUG)  # Capture DEBUG and above in the console
+stream_handler.setFormatter(formatter)
+logger.addHandler(stream_handler)
+
+# ------------------------------------------------------------------------------------
+# Log the Start of the Script
+# ------------------------------------------------------------------------------------
+
+logger.info(
+    "Starting the voice & text function calling script to send airtime and messages "
+    "using the Africa's Talking API"
+)
+logger.info("Review whether Groq's speech-to-text service logs the audio data or not")
+logger.info("Let's review the packages and their versions")
+
+# ------------------------------------------------------------------------------------
+# Log Versions of the Libraries
+# ------------------------------------------------------------------------------------
+
+pkgs = [
+    "africastalking",
+    "ollama",
+    "duckduckgo_search",
+    "langtrace-python-sdk",
+    "gradio",
+    "groq",
+    "soundfile",
+    "numpy",
+]
+
+for pkg in pkgs:
+    try:
+        pkg_version = version(pkg)
+        logger.info("%s version: %s", pkg, pkg_version)
+    except PackageNotFoundError:
+        logger.error("Package %s is not installed.", pkg)
+    except Exception as e:
+        logger.error("Failed to retrieve version for %s: %s", pkg, str(e))
+
+# ------------------------------------------------------------------------------------
+# Define Tools Schema
+# ------------------------------------------------------------------------------------
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "send_airtime",
+            "description": "Send airtime to a phone number using the Africa's Talking API",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "phone_number": {
+                        "type": "string",
+                        "description": "The phone number in international format",
+                    },
+                    "currency_code": {
+                        "type": "string",
+                        "description": "The 3-letter ISO currency code",
+                    },
+                    "amount": {
+                        "type": "string",
+                        "description": "The amount of airtime to send",
+                    },
+                },
+                "required": ["phone_number", "currency_code", "amount"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "send_message",
+            "description": "Send a message to a phone number using the Africa's Talking API",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "phone_number": {
+                        "type": "string",
+                        "description": "The phone number in international format",
+                    },
+                    "message": {
+                        "type": "string",
+                        "description": "The message to send",
+                    },
+                    "username": {
+                        "type": "string",
+                        "description": "The username for the Africa's Talking account",
+                    },
+                },
+                "required": ["phone_number", "message", "username"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "search_news",
+            "description": "Search for news articles using DuckDuckGo News API",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {
+                        "type": "string",
+                        "description": "The search query for news articles",
+                    },
+                    "max_results": {
+                        "type": "integer",
+                        "description": "The maximum number of news articles to retrieve",
+                        "default": 5,
+                    },
+                },
+                "required": ["query"],
+            },
+        },
+    },
+]
+
+# ------------------------------------------------------------------------------------
+# Function Definitions
+# ------------------------------------------------------------------------------------
+
+
+@with_langtrace_root_span()
+async def process_user_message(message: str, history: list) -> str:
+    """
+    Handle the conversation with the model asynchronously.
+
+    Parameters
+    ----------
+    message : str
+        The user's input message.
+    history : list of list of str
+        The conversation history up to that point.
+
+    Returns
+    -------
+    str
+        The model's response or the function execution result.
+    """
+    logger.info("Processing user message: %s", message)
+    client = ollama.AsyncClient()
+
+    messages = [
+        {
+            "role": "user",
+            "content": message,
+        }
+    ]
+
+    try:
+        response = await client.chat(
+            model="qwen2.5:0.5b",
+            messages=messages,
+            tools=tools,
+        )
+    except Exception as e:
+        logger.exception("Failed to get response from Ollama client.")
+        return "An unexpected error occurred while communicating with the assistant."
+
+    model_message = response.get("message", {})
+    model_content = model_message.get("content", "")
+    model_role = model_message.get("role", "assistant")
+    logger.info("Model response: %s", model_content)
+
+    messages.append(
+        {
+            "role": model_role,
+            "content": model_content,
+        }
+    )
+    logger.debug("Model response details: %s", response.get("message"))
+
+    if model_message.get("tool_calls"):
+        for tool in model_message["tool_calls"]:
+            tool_name = tool["function"]["name"]
+            arguments = tool["function"]["arguments"]
+            logger.info("Tool call detected: %s", tool_name)
+
+            try:
+                if tool_name == "send_airtime":
+                    logger.info("Calling send_airtime with arguments: %s", arguments)
+                    function_response = send_airtime(
+                        arguments["phone_number"],
+                        arguments["currency_code"],
+                        arguments["amount"],
+                    )
+                elif tool_name == "send_message":
+                    logger.info("Calling send_message with arguments: %s", arguments)
+                    function_response = send_message(
+                        arguments["phone_number"],
+                        arguments["message"],
+                        arguments["username"],
+                    )
+                elif tool_name == "search_news":
+                    logger.info("Calling search_news with arguments: %s", arguments)
+                    function_response = search_news(arguments["query"])
+                else:
+                    function_response = json.dumps({"error": "Unknown function"})
+                    logger.warning("Unknown function called: %s", tool_name)
+
+                logger.debug("Function response: %s", function_response)
+                messages.append(
+                    {
+                        "role": "tool",
+                        "content": function_response,
+                    }
+                )
+
+                return f"Function `{tool_name}` executed successfully. Response:\n{function_response}"
+            except KeyError as e:
+                # A tool call missing a required argument is a handled error
+                logger.error("Missing argument for tool `%s`: %s", tool_name, e)
+                return f"Error executing `{tool_name}`: missing argument {e}"
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                logger.exception("Unexpected error in tool `%s`: %s", tool_name, e)
+                return f"An unexpected error occurred while executing `{tool_name}`."
+    else:
+        return model_content
+
+
+# ------------------------------------------------------------------------------------
+# Audio Processing
+# ------------------------------------------------------------------------------------
+
+
+async def process_audio_and_llm(audio):
+    """
+    Process the audio recording and get the transcription using Groq.
+
+    Parameters
+    ----------
+    audio : tuple
+        The audio recording tuple with the sample rate and audio data.
+
+    Returns
+    -------
+    str
+        The transcription and LLM response.
+
+    Raises
+    ------
+    Exception
+        If there is an error in processing the audio.
+    """
+    if audio is None:
+        return "Error: No audio recorded. Please try again."
+    try:
+        sr, y = audio
+        if len(y) == 0:
+            return "Error: Empty audio recording. Please speak and try again."
+        # Convert to mono if stereo
+        if y.ndim > 1:
+            y = y.mean(axis=1)
+
+        # Normalize audio, guarding against an all-zero (silent) clip
+        y = y.astype(np.float32)
+        peak = np.max(np.abs(y))
+        if peak > 0:
+            y /= peak
+
+        # Write audio to buffer
+        buffer = io.BytesIO()
+        sf.write(buffer, y, sr, format="wav")
+        buffer.seek(0)
+
+        try:
+            # Get transcription from Groq
+            transcription = groq_client.audio.transcriptions.create(
+                model="distil-whisper-large-v3-en",
+                file=("audio.wav", buffer),
+                response_format="text",
+            )
+
+            # Process transcription with LLM
+            response = await process_user_message(transcription, [])
+            return f"Transcription: {transcription}\nLLM Response: {response}"
+
+        except Exception as e:
+            logger.exception("Error during transcription or LLM processing: %s", e)
+            return f"Error: {str(e)}"
+    except Exception as e:
+        logger.exception("Error in audio processing: %s", e)
+        return f"Error: {str(e)}"
+
+
+def gradio_interface(message: str, history: list) -> str:
+    """
+    Gradio interface function to process user messages and track emissions.
+
+    Parameters
+    ----------
+    message : str
+        The user's input message.
+    history : list of list of str
+        The conversation history up to that point.
+
+    Returns
+    -------
+    str
+        The model's response or the function execution result.
+    """
+    try:
+        response = asyncio.run(process_user_message(message, history))
+        return response
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.exception("Error in gradio_interface: %s", e)
+        return "An unexpected error occurred while processing your message."
+
+
+# ------------------------------------------------------------------------------------
+# Create Gradio Interface with Both Text and Audio Inputs
+# ------------------------------------------------------------------------------------
+
+with gr.Blocks(title="🎙️ Voice Command Communication Interface 🌍") as demo:
+    gr.Markdown("# Voice Command & Text Communication Interface")
+
+    # Add tabs for voice and text input
+    with gr.Tab("Voice Input"):
+        # How to use
+        gr.Markdown(
+            """
+This interface allows you to send airtime, messages, and search
+for news articles using voice commands.
+You can also type your commands in the text input tab.
+Here are some examples of commands you can use:
+- Send airtime to +254712345678 with an amount of 10 in currency KES 📞
+- Send a message to +254712345678 with the message 'Hello there', using
+  the username 'add your username' 💬
+- Search news for 'latest technology trends' 📰
+* Please speak clearly and concisely for accurate transcription. In English only for now.
+* You can also edit the transcription before processing. We all make mistakes! 🤗
+"""
+        )
+        audio_input = gr.Audio(
+            sources=["microphone", "upload"],
+            type="numpy",
+            label="Speak your command",
+            streaming=False,
+        )
+        transcription_preview = gr.Textbox(
+            label="Preview Transcription (Edit if needed)",
+            interactive=True,
+            placeholder="Transcription will appear here first...",
+        )
+        audio_output = gr.Textbox(
+            label="Final Result", placeholder="LLM response will appear here..."
+        )
+        with gr.Row():
+            transcribe_button = gr.Button("Transcribe")
+            process_button = gr.Button("Process Edited Text", variant="primary")
+
+        def show_transcription(audio):
+            """
+            Transcribe the audio recording and show the preview.
+
+            Parameters
+            ----------
+            audio : tuple
+                The audio recording tuple with the sample rate and audio data.
+
+            Returns
+            -------
+            str
+                The transcription of the audio recording.
+ """ + try: + if audio is None: + return "Error: No audio recorded. Please try again." + sr, y = audio + if len(y) == 0: + return "Error: Empty audio recording. Please speak and try again." + + # Convert to mono if stereo + if y.ndim > 1: + y = y.mean(axis=1) + + # Normalize audio + y = y.astype(np.float32) + y /= np.max(np.abs(y)) + + # Write audio to buffer + buffer = io.BytesIO() + sf.write(buffer, y, sr, format="wav") + buffer.seek(0) + + # Get transcription from Groq + transcription = groq_client.audio.transcriptions.create( + model="distil-whisper-large-v3-en", + file=("audio.wav", buffer), + response_format="text", + ) + logger.info("Audio transcribed successfully: %s", transcription) + return transcription + except Exception as e: + logger.exception("Error during transcription: %s", e) + return f"Error: {str(e)}" + + # Wire up the components + transcribe_button.click( + fn=show_transcription, inputs=audio_input, outputs=transcription_preview + ) + + # Process the edited text + process_button.click( + fn=lambda x: asyncio.run(process_user_message(x, [])), + inputs=transcription_preview, + outputs=audio_output, + ) + + # Text input tab + with gr.Tab("Text Input"): + chat_interface = gr.ChatInterface( + fn=gradio_interface, + description=( + "Type your commands or use voice input above:\n" + "- Send airtime to +254712345678 with an amount of 10 in currency KES 📞\n" + "- Send a message to +254712345678 with the message 'Hello there' 💬\n" + "- Search news for 'latest technology trends' 📰" + ), + type="messages", + ) + +if __name__ == "__main__": + try: + logger.info("Launching Gradio interface...") + demo.launch(inbrowser=True, server_name="0.0.0.0", server_port=7860) + logger.info("Gradio interface launched successfully.") + except Exception as e: + logger.exception("Failed to launch Gradio interface: %s", e) + logger.info("Script execution completed")