From 4fa237d86230c4fbbb477d838657a55ab36d9703 Mon Sep 17 00:00:00 2001 From: Matt Dancho Date: Tue, 31 Dec 2024 11:25:19 -0500 Subject: [PATCH] sql agent tutorial --- .../agents/data_cleaning_agent.py | 2 + .../agents/data_wrangling_agent.py | 2 + .../agents/feature_engineering_agent.py | 2 + .../agents/sql_database_agent.py | 3 +- examples/sql_database_agent.ipynb | 704 ++++++++++++++++++ 5 files changed, 712 insertions(+), 1 deletion(-) create mode 100644 examples/sql_database_agent.ipynb diff --git a/ai_data_science_team/agents/data_cleaning_agent.py b/ai_data_science_team/agents/data_cleaning_agent.py index fff5112..45908cd 100644 --- a/ai_data_science_team/agents/data_cleaning_agent.py +++ b/ai_data_science_team/agents/data_cleaning_agent.py @@ -201,6 +201,8 @@ def recommend_cleaning_steps(state: GraphState): } def create_data_cleaner_code(state: GraphState): + if bypass_recommended_steps: + print("---DATA CLEANING AGENT----") print(" * CREATE DATA CLEANER CODE") data_cleaning_prompt = PromptTemplate( diff --git a/ai_data_science_team/agents/data_wrangling_agent.py b/ai_data_science_team/agents/data_wrangling_agent.py index 7263906..6e17102 100644 --- a/ai_data_science_team/agents/data_wrangling_agent.py +++ b/ai_data_science_team/agents/data_wrangling_agent.py @@ -194,6 +194,8 @@ def recommend_wrangling_steps(state: GraphState): def create_data_wrangler_code(state: GraphState): + if bypass_recommended_steps: + print("---DATA WRANGLING AGENT----") print(" * CREATE DATA WRANGLER CODE") data_wrangling_prompt = PromptTemplate( diff --git a/ai_data_science_team/agents/feature_engineering_agent.py b/ai_data_science_team/agents/feature_engineering_agent.py index 570d86c..d0df800 100644 --- a/ai_data_science_team/agents/feature_engineering_agent.py +++ b/ai_data_science_team/agents/feature_engineering_agent.py @@ -216,6 +216,8 @@ def human_review(state: GraphState) -> Command[Literal["recommend_feature_engine ) def create_feature_engineering_code(state: GraphState): 
+ if bypass_recommended_steps: + print("---FEATURE ENGINEERING AGENT----") print(" * CREATE FEATURE ENGINEERING CODE") feature_engineering_prompt = PromptTemplate( diff --git a/ai_data_science_team/agents/sql_database_agent.py b/ai_data_science_team/agents/sql_database_agent.py index b5c161f..9610dec 100644 --- a/ai_data_science_team/agents/sql_database_agent.py +++ b/ai_data_science_team/agents/sql_database_agent.py @@ -182,7 +182,8 @@ def recommend_sql_steps(state: GraphState): } def create_sql_query_code(state: GraphState): - + if bypass_recommended_steps: + print("---SQL DATABASE AGENT---") print(" * CREATE SQL QUERY CODE") # Prompt to get the SQL code from the LLM diff --git a/examples/sql_database_agent.ipynb b/examples/sql_database_agent.ipynb new file mode 100644 index 0000000..12773ac --- /dev/null +++ b/examples/sql_database_agent.ipynb @@ -0,0 +1,704 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to Automate SQL Database Queries with AI\n", + "\n", + "### Free Generative AI Data Science Workshop\n", + "\n", + "If you want to learn how to build AI Agents that perform Data Science, Business Intelligence, Churn Modeling, Time Series Forecasting, and more, [register for my next free AI for Data Scientists workshop here.](https://learn.business-science.io/ai-register)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "bat" + } + }, + "source": [ + "# Table of Contents\n", + "\n", + "1. [Introduction](#introduction)\n", + "2. [Load Libraries](#load-libraries)\n", + "3. [Setup AI and Logging](#setup-ai-and-logging)\n", + "4. [Connect to a SQL Database](#connect-to-a-sql-database)\n", + "5. [Create The Agent](#create-the-agent)\n", + "6. [Run the Agent](#run-the-agent)\n", + "7. [Response](#response)\n", + " 1. [SQL Query Code](#sql-query-code)\n", + " 2. [Pandas Data Frame From SQL Query](#pandas-data-frame-from-sql-query)\n", + " 3. 
[Python Pipeline Function](#python-pipeline-function)\n", + " 4. [Storage Location](#storage-location)\n", + "8. [Free Generative AI Data Science Workshop](#free-generative-ai-data-science-workshop)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# * Libraries\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "import pandas as pd\n", + "import sqlalchemy as sql\n", + "\n", + "import os\n", + "import yaml\n", + "from pprint import pprint\n", + "\n", + "from ai_data_science_team.agents import make_sql_database_agent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup AI and Logging\n", + "\n", + "This section of code sets up the LLM inputs and the logging information. Logging is used to store AI-generated code and files during the AI Data Science Team's processing of files. \n", + "\n", + "*Important Note:* This example uses OpenAI's API. But any LLM can be used such as Anthropic or local LLMs with Ollama." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ChatOpenAI(client=, async_client=, root_client=, root_async_client=, model_name='gpt-4o-mini', model_kwargs={}, openai_api_key=SecretStr('**********'))" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# * Setup\n", + "\n", + "MODEL = \"gpt-4o-mini\"\n", + "LOG = True\n", + "LOG_PATH = os.path.join(os.getcwd(), \"logs/\")\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = yaml.safe_load(open('../credentials.yml'))['openai']\n", + "\n", + "llm = ChatOpenAI(model = MODEL)\n", + "\n", + "llm\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to a SQL Database\n", + "\n", + "Next, let's connect to the leads data from a SQL database. 
We will need to use a `sqlalchemy` connection to use the SQL Database Agent." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_engine = sql.create_engine(\"sqlite:///data/leads_scored.db\")\n", + "\n", + "conn = sql_engine.connect()\n", + "\n", + "conn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create The Agent\n", + "\n", + "Run this code to create the agent with `make_sql_database_agent()`.\n", + "\n", + "The only required parameters are `model` and `connection`. \n", + "\n", + "- `model` is the LLM model that you want to use.\n", + "- `connection` is the sqlalchemy connection to the SQL database.\n", + "\n", + "Other parameters that I've included in this demo are:\n", + "\n", + "- `log`: Set up logging the SQL DB agent's pipeline function\n", + "- `log_path`: The directory to save the SQL DB agent's pipeline function\n", + "- `bypass_explain_code`: Dynamically bypass the explain code step in the LangGraph DAG. This is useful to speed up response time when you don't need to see the explain code.\n", + "- `bypass_recommended_steps`: Dynamically bypass the recommended steps in the LangGraph DAG. This is useful to speed up response time when you don't need to see the recommended steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create the agent\n", + "sql_agent = make_sql_database_agent(\n", + " model = llm, \n", + " connection=conn, \n", + " log=LOG, \n", + " log_path=LOG_PATH,\n", + " bypass_explain_code=True,\n", + " bypass_recommended_steps=True,\n", + ")\n", + "\n", + "sql_agent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the Agent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main inputs to the SQL DB Agent are:\n", + "\n", + "- **user_instructions**: What actions you'd like to take on the SQL database query. \n", + "- **max_retries**: Used to limit the number of attempts to fix the SQL and Python code generated by the agent. Set this to 3 to limit to 3 attempts. \n", + "- **retry_count**: Set this to 0. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---SQL DATABASE AGENT---\n", + " * CREATE SQL QUERY CODE\n", + " * CREATE PYTHON FUNCTION TO RUN SQL CODE\n", + " File saved to: /Users/mdancho/Desktop/course_code/ai-data-science-team/logs/sql_database.py\n", + " * EXECUTING AGENT CODE ON SQL CONNECTION\n" + ] + } + ], + "source": [ + "\n", + "response = sql_agent.invoke({\n", + " \"user_instructions\": \"Aggregate the product transactions by Product Description. Use suggested price and a quantity of 1 to approximate sales.\",\n", + " \"max_retries\":3, \n", + " \"retry_count\":0\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Response\n", + "\n", + "The response produced contains everything we need to understand the SQL query the agent generated and to get the queried data. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['messages',\n", + " 'user_instructions',\n", + " 'data_sql',\n", + " 'sql_query_code',\n", + " 'sql_database_function',\n", + " 'sql_database_function_path',\n", + " 'sql_database_function_name',\n", + " 'sql_database_error',\n", + " 'max_retries',\n", + " 'retry_count']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(response.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### SQL Query Code" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('SELECT p.description, \\n'\n", + " ' SUM(p.suggested_price) AS total_sales\\n'\n", + " 'FROM products p\\n'\n", + " 'JOIN transactions t ON p.product_id = t.product_id\\n'\n", + " 'GROUP BY p.description;')\n" + ] + } + ], + "source": [ + "pprint(response['sql_query_code'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pandas Data Frame From SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descriptiontotal_sales
04-Course Bundle - Machine Learning + Expert We...95216.666667
14-Course Bundle - Machine Learning + Expert We...4858.333333
24-Course Bundle - Machine Learning + Expert We...222500.000000
35 Course Bundle - Machine Learning + Web Apps ...1650.000000
45 Course Bundle - Machine Learning + Web Apps ...5638.888889
55 Course Bundle - Machine Learning + Web Apps ...2625.000000
65 Course Bundle - Machine Learning + Web Apps ...114000.000000
7Bundle - DS For Business + Web Apps (Level 1):...25666.666667
8Bundle - DS For Business + Web Apps (Level 1):...1600.000000
9Bundle - DS For Business + Web Apps (Level 1):...11825.000000
10Bundle - DS For Business + Web Apps (Level 1):...57000.000000
11Bundle - Data Science Starter Kit: R-Track - C...2600.000000
12Bundle - Machine Learning For Business: R-Trac...6066.666667
13Bundle - Machine Learning For Business: R-Trac...50600.000000
14DS4B 101-R: Business Analysis With R - 3 Low M...9200.000000
15DS4B 101-R: Business Analysis With R - 6 Low M...4700.000000
16DS4B 101-R: Business Analysis With R - Monthly...2600.000000
17DS4B 101-R: Business Analysis With R - Paid Co...43200.000000
18DS4B 101-R: Business Analysis With R - Single ...73200.000000
19DS4B 102-R: Shiny Web Applications (Intermedia...6200.000000
20DS4B 102-R: Shiny Web Applications (Intermedia...60000.000000
21DS4B 201-R: Data Science For Business With R -...2400.000000
22DS4B 201-R: Data Science For Business With R -...1500.000000
23DS4B 201-R: Data Science For Business With R -...7000.000000
24DS4B 201-R: Data Science For Business With R -...12600.000000
25DS4B 201-R: Data Science For Business With R -...88900.000000
26DS4B 201-R: Data Science For Business With R -...35000.000000
27DS4B 202A-R: Shiny Developer with AWS - 6 Low ...4650.000000
28DS4B 202A-R: Shiny Developer with AWS - Paid C...89600.000000
29DS4B 203-R: High-Performance Time Series Forec...2000.000000
30DS4B 203-R: High-Performance Time Series Forec...6716.666667
31DS4B 203-R: High-Performance Time Series Forec...112000.000000
32Jumpstart with R - Get Jumpstarted!225.000000
33Learning Labs Pro - 6-month Payment Option1200.000000
34Learning Labs Pro - Low Monthly Payments10290.000000
35Learning Labs Pro - Paid Course105644.000000
36Learning Labs Pro - Subscribe to LL PRO 6-Mont...975.000000
37Learning Labs Pro - Subscribe to LL PRO Annual...9000.000000
38Learning Labs Pro - Subscribe to LL PRO Monthl...5341.000000
39Learning Labs Pro - Yearly Membership14400.000000
40Learning Labs Pro - Yearly Plan90000.000000
\n", + "
" + ], + "text/plain": [ + " description total_sales\n", + "0 4-Course Bundle - Machine Learning + Expert We... 95216.666667\n", + "1 4-Course Bundle - Machine Learning + Expert We... 4858.333333\n", + "2 4-Course Bundle - Machine Learning + Expert We... 222500.000000\n", + "3 5 Course Bundle - Machine Learning + Web Apps ... 1650.000000\n", + "4 5 Course Bundle - Machine Learning + Web Apps ... 5638.888889\n", + "5 5 Course Bundle - Machine Learning + Web Apps ... 2625.000000\n", + "6 5 Course Bundle - Machine Learning + Web Apps ... 114000.000000\n", + "7 Bundle - DS For Business + Web Apps (Level 1):... 25666.666667\n", + "8 Bundle - DS For Business + Web Apps (Level 1):... 1600.000000\n", + "9 Bundle - DS For Business + Web Apps (Level 1):... 11825.000000\n", + "10 Bundle - DS For Business + Web Apps (Level 1):... 57000.000000\n", + "11 Bundle - Data Science Starter Kit: R-Track - C... 2600.000000\n", + "12 Bundle - Machine Learning For Business: R-Trac... 6066.666667\n", + "13 Bundle - Machine Learning For Business: R-Trac... 50600.000000\n", + "14 DS4B 101-R: Business Analysis With R - 3 Low M... 9200.000000\n", + "15 DS4B 101-R: Business Analysis With R - 6 Low M... 4700.000000\n", + "16 DS4B 101-R: Business Analysis With R - Monthly... 2600.000000\n", + "17 DS4B 101-R: Business Analysis With R - Paid Co... 43200.000000\n", + "18 DS4B 101-R: Business Analysis With R - Single ... 73200.000000\n", + "19 DS4B 102-R: Shiny Web Applications (Intermedia... 6200.000000\n", + "20 DS4B 102-R: Shiny Web Applications (Intermedia... 60000.000000\n", + "21 DS4B 201-R: Data Science For Business With R -... 2400.000000\n", + "22 DS4B 201-R: Data Science For Business With R -... 1500.000000\n", + "23 DS4B 201-R: Data Science For Business With R -... 7000.000000\n", + "24 DS4B 201-R: Data Science For Business With R -... 12600.000000\n", + "25 DS4B 201-R: Data Science For Business With R -... 88900.000000\n", + "26 DS4B 201-R: Data Science For Business With R -... 
35000.000000\n", + "27 DS4B 202A-R: Shiny Developer with AWS - 6 Low ... 4650.000000\n", + "28 DS4B 202A-R: Shiny Developer with AWS - Paid C... 89600.000000\n", + "29 DS4B 203-R: High-Performance Time Series Forec... 2000.000000\n", + "30 DS4B 203-R: High-Performance Time Series Forec... 6716.666667\n", + "31 DS4B 203-R: High-Performance Time Series Forec... 112000.000000\n", + "32 Jumpstart with R - Get Jumpstarted! 225.000000\n", + "33 Learning Labs Pro - 6-month Payment Option 1200.000000\n", + "34 Learning Labs Pro - Low Monthly Payments 10290.000000\n", + "35 Learning Labs Pro - Paid Course 105644.000000\n", + "36 Learning Labs Pro - Subscribe to LL PRO 6-Mont... 975.000000\n", + "37 Learning Labs Pro - Subscribe to LL PRO Annual... 9000.000000\n", + "38 Learning Labs Pro - Subscribe to LL PRO Monthl... 5341.000000\n", + "39 Learning Labs Pro - Yearly Membership 14400.000000\n", + "40 Learning Labs Pro - Yearly Plan 90000.000000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(response['data_sql'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Python Pipeline Function" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('# Disclaimer: This function was generated by AI. 
Please review before '\n", + " 'using.\\n'\n", + " '# Agent Name: sql_database_agent\\n'\n", + " '# Time Created: 2024-12-31 11:20:18\\n'\n", + " '\\n'\n", + " '\\n'\n", + " 'def sql_database_pipeline(connection):\\n'\n", + " ' import pandas as pd\\n'\n", + " ' import sqlalchemy as sql\\n'\n", + " ' \\n'\n", + " ' # Create a connection if needed\\n'\n", + " ' is_engine = isinstance(connection, sql.engine.base.Engine)\\n'\n", + " ' conn = connection.connect() if is_engine else connection\\n'\n", + " '\\n'\n", + " \" sql_query = '''\\n\"\n", + " ' SELECT p.description, \\n'\n", + " ' SUM(p.suggested_price) AS total_sales\\n'\n", + " 'FROM products p\\n'\n", + " 'JOIN transactions t ON p.product_id = t.product_id\\n'\n", + " 'GROUP BY p.description;\\n'\n", + " \" '''\\n\"\n", + " ' \\n'\n", + " ' return pd.read_sql(sql_query, connection)\\n'\n", + " ' ')\n" + ] + } + ], + "source": [ + "pprint(response['sql_database_function'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Storage location if you logged the pipeline function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/mdancho/Desktop/course_code/ai-data-science-team/logs/sql_database.py\n" + ] + } + ], + "source": [ + "print(response['sql_database_function_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Free Generative AI Data Science Workshop\n", + "\n", + "If you want to learn how to build AI Agents that perform Data Science, Business Intelligence, Churn Modeling, Time Series Forecasting, and more, [register for my next free AI for Data Scientists workshop here.](https://learn.business-science.io/ai-register)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ds4b_301p_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}