Merge pull request #4 from Shuyib/evals

Shuyib · web-flow · commit 384d43407941 · 2024-12-20T09:19:33.000+03:00
Evals
diff --git a/Makefile b/Makefile
@@ -43,7 +43,7 @@ docstring: activate
 
 format: activate 
 	# format code
-	black utils/*.py *.py
+	black utils/*.py tests/*.py
 
 clean:
 	# clean directory of cache
@@ -61,6 +61,7 @@ clean:
 	rm -rf utils/__pycache__
 	rm -rf utils/*.log
 	rm -rf *.log
+	rm -rf tests/__pycache__
 
 lint: activate install 
 	#flake8 or #pylint
@@ -69,6 +70,14 @@ lint: activate install
 	# C - convention
 	pylint --disable=R,C --errors-only *.py 
 
+test: activate install
+	# run the test suites: test_cases.py first, then test_run.py in strict asyncio mode
+	@echo "running tests"
+	@echo "signature used to run a test file: $(PYTHON) -m pytest tests/test_cases.py"
+	@echo "signature used for a single test: $(PYTHON) -m pytest tests/test_cases.py::test_function_name"
+	$(PYTHON) -m pytest tests/test_cases.py -v
+	$(PYTHON) -m pytest tests/test_run.py -v --asyncio-mode=strict
+
 run: activate install format
 	# run test_app
 	# run each file separately, bc if one fails, all fail
diff --git a/README.md b/README.md
@@ -36,10 +36,15 @@ Learn more about tool calling <https://gorilla.cs.berkeley.edu/leaderboard.html>
 ├── README.md - This file contains the project documentation. This is the file you are currently reading.       
 ├── requirements.txt - This file contains the dependencies for the project.  
 ├── summary.png - How function calling works with a diagram.   
+├── tests - This directory contains the test files for the project.
+│   ├── __init__.py - This file initializes the tests directory as a package.     
+│   ├── test_cases.py - This file contains the test cases for the project.
+│   └── test_run.py - This file contains the code to run the test cases for the function calling LLM.    
 └── utils - This directory contains the utility files for the project.       
     ├── __init__.py - This file initializes the utils directory as a package.     
     ├── function_call.py - This file contains the code to call a function using LLMs.        
-    └── communication_apis.py - This file contains the code to do with communication apis & experiments.             
+    └── communication_apis.py - This file contains the code to do with communication apis & experiments.       
+      
     
 ## Installation
 The project uses python 3.12. To install the project, follow the steps below:    
@@ -113,6 +118,9 @@ Notes:
 echo "AT_API_KEY = yourapikey" >> .env
 echo "AT_USERNAME = yourusername" >> .env
 echo "LANGTRACE_API_KEY= yourlangtraceapikey" >> .env  
+echo "TEST_PHONE_NUMBER = yourphonenumber" >> .env
+echo "TEST_PHONE_NUMBER_2 = yourphonenumber" >> .env
+echo "TEST_PHONE_NUMBER_3 = yourphonenumber" >> .env
 ```
 - The Dockerfile creates 2 images for the ollama server and the gradio dashboard. The ollama server is running on port 11434 and the gradio dashboard is running on port 7860 . You can access the gradio dashboard by visiting <http://localhost:7860> in your browser & the ollama server by visiting <http://localhost:11434> in your browser. They consume about 2.72GB of storage in the container.       
 - The docker-compose.yml file is used to run the ollama server and the gradio dashboard. The docker-compose-codecarbon.yml file is used to run the ollama server, the gradio dashboard and the codecarbon project.
@@ -141,6 +149,10 @@ ollama run qwen2.5:0.5b
 ```bash
 export AT_API_KEY=yourapikey
 export AT_USERNAME=yourusername
+export LANGTRACE_API_KEY=yourlangtraceapikey
+export TEST_PHONE_NUMBER=yourphonenumber
+export TEST_PHONE_NUMBER_2=yourphonenumber
+export TEST_PHONE_NUMBER_3=yourphonenumber
 ```
 - Continue running the installation steps in the terminal.    
 - Send your first message and airtime with an LLM. 🌠     
@@ -152,6 +164,14 @@ This project uses LLMs to send airtime to a phone number. The difference is that
 - Send airtime to xxxxxxxxxx046 and xxxxxxxxxx524 with an amount of 10 in currency KES.   
 - Send a message to xxxxxxxxxx046 and xxxxxxxxxx524 with a message "Hello, how are you?", using the username "username".
 
+### Responsible AI Practices
+This project implements several responsible AI practices:   
+- All test data is anonymized to protect privacy.      
+- Input validation to prevent misuse (negative amounts, spam detection).    
+- Handling of sensitive content and edge cases.       
+- Comprehensive test coverage for various scenarios.    
+- Secure handling of credentials and personal information.    
+
 ![Process Summary](summary.png)
 
 ## Use cases
@@ -164,5 +184,12 @@ This project uses LLMs to send airtime to a phone number. The difference is that
 ## Contributing
 Contributions are welcome. If you would like to contribute to the project, you can fork the repository, create a new branch, make your changes and then create a pull request.
 
+### Testing Guidelines
+When contributing, please ensure:
+- All test data uses anonymized placeholders
+- Edge cases and invalid inputs are properly tested
+- Sensitive content handling is verified
+- No real personal information is included in tests
+
 ## License
 [License information](https://github.com/Shuyib/tool_calling_api/blob/main/LICENSE).
diff --git a/requirements.txt b/requirements.txt
@@ -10,3 +10,5 @@ gradio==5.7.1
 duckduckgo_search==6.3.2
 langtrace-python-sdk==3.3.14
 setuptools==75.6.0
+pytest==8.3.4
+pytest-asyncio==0.25.0
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_cases.py b/tests/test_cases.py
@@ -0,0 +1,131 @@
+"""
+Unit tests for the function calling utilities.
+
+This module contains tests for sending airtime, sending messages, and searching news
+using the Africa's Talking API and DuckDuckGo News API. The tests mock external
+dependencies to ensure isolation and reliability.
+"""
+
+import os
+import re
+from unittest.mock import patch
+from utils.function_call import send_airtime, send_message, search_news
+
+# Recipient number read from the TEST_PHONE_NUMBER env var (None when it is unset)
+PHONE_NUMBER = os.getenv("TEST_PHONE_NUMBER")
+
+
+@patch("utils.function_call.africastalking.Airtime")
+def test_send_airtime_success(mock_airtime):
+    """
+    Test the send_airtime function to ensure it successfully sends airtime.
+
+    This test mocks the Africa's Talking Airtime API and verifies that the
+    send_airtime function returns a response containing the word 'Sent'.
+
+    Parameters
+    ----------
+    mock_airtime : MagicMock
+        Mocked Airtime API from Africa's Talking.
+    """
+    # Canned payload for send() on the object returned by calling the patched Airtime
+    mock_airtime.return_value.send.return_value = {
+        "numSent": 1,
+        "responses": [{"status": "Sent"}],
+    }
+
+    # Call send_airtime(PHONE_NUMBER, "KES", 5); PHONE_NUMBER is None if the env var is unset
+    result = send_airtime(PHONE_NUMBER, "KES", 5)
+
+    # Regex patterns that must each occur somewhere in str(result)
+    message_patterns = [
+        r"Sent",
+    ]
+
+    # re.search is unanchored, so a match anywhere in the stringified result passes
+    for pattern in message_patterns:
+        assert re.search(
+            pattern, str(result)
+        ), f"Pattern '{pattern}' not found in response"
+
+
+@patch("utils.function_call.africastalking.SMS")
+def test_send_message_success(mock_sms):
+    """
+    Test the send_message function to ensure it successfully sends a message.
+
+    This test mocks the Africa's Talking SMS API and verifies that the
+    send_message function returns a response containing 'Sent to 1/1'.
+
+    Parameters
+    ----------
+    mock_sms : MagicMock
+        Mocked SMS API from Africa's Talking.
+    """
+    # Canned payload for send() on the object returned by calling the patched SMS
+    mock_sms.return_value.send.return_value = {
+        "SMSMessageData": {"Message": "Sent to 1/1"}
+    }
+
+    # Third argument is read from AT_USERNAME; os.getenv yields None when it is unset
+    result = send_message(PHONE_NUMBER, "In Qwen, we trust", os.getenv("AT_USERNAME"))
+
+    # Regex patterns that must each occur somewhere in str(result)
+    message_patterns = [r"Sent to 1/1"]
+
+    # re.search is unanchored, so a match anywhere in the stringified result passes
+    for pattern in message_patterns:
+        assert re.search(
+            pattern, str(result)
+        ), f"Pattern '{pattern}' not found in response"
+
+
+@patch("utils.function_call.DDGS")
+def test_search_news_success(mock_ddgs):
+    """
+    Test the search_news function to ensure it retrieves news articles correctly.
+
+    This test mocks the DuckDuckGo News API and verifies that the
+    search_news function returns results matching the expected patterns.
+
+    Parameters
+    ----------
+    mock_ddgs : MagicMock
+        Mocked DuckDuckGo DDGS API.
+    """
+    # Canned one-article list returned by news() on the object from calling patched DDGS
+    mock_ddgs.return_value.news.return_value = [
+        {
+            "date": "2024-12-20T02:07:00+00:00",
+            "title": "Hedge fund leader loves this AI stock",
+            "body": "Sample article body text",
+            "url": "https://example.com/article",
+            "image": "https://example.com/image.jpg",
+            "source": "MSN",
+        }
+    ]
+
+    # Query for "AI"; the final assertion pins the arguments forwarded to news()
+    result = search_news("AI")
+
+    # Regex patterns that must each occur somewhere in str(result)
+    patterns = [
+        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}",  # Timestamp with +HH:MM offset
+        r'"title":\s*"[^"]+?"',  # Double-quoted title key with a non-empty value
+        r'"source":\s*"[^"]+?"',  # Double-quoted source key with a non-empty value
+        r'https?://[^\s<>"]+?',  # URL start; the lazy +? needs only one char after //
+    ]
+
+    # Stringify once so every pattern is searched against the same text
+    result_str = str(result)
+
+    # Every pattern must match somewhere in result_str
+    for pattern in patterns:
+        assert re.search(
+            pattern, result_str
+        ), f"Pattern '{pattern}' not found in response"
+
+    # news() must have been called exactly once, with exactly these keyword arguments
+    mock_ddgs.return_value.news.assert_called_once_with(
+        keywords="AI", region="wt-wt", safesearch="off", timelimit="d", max_results=5
+    )
diff --git a/tests/test_run.py b/tests/test_run.py