Commit d29a9d6

add code for slides

vemonet committed Feb 17, 2025
1 parent 7bbcc80
Showing 5 changed files with 235 additions and 17 deletions.
135 changes: 135 additions & 0 deletions tutorial/app.py
@@ -0,0 +1,135 @@
from langchain_core.language_models import BaseChatModel

# question = "What are the rat orthologs of human TP53?"

def load_chat_model(model: str) -> BaseChatModel:
    provider, model_name = model.split("/", maxsplit=1)
    if provider == "groq":
        # https://python.langchain.com/docs/integrations/chat/groq/
        from langchain_groq import ChatGroq
        return ChatGroq(model_name=model_name, temperature=0)
    if provider == "openai":
        # https://python.langchain.com/docs/integrations/chat/openai/
        from langchain_openai import ChatOpenAI
        return ChatOpenAI(model_name=model_name, temperature=0)
    if provider == "ollama":
        # https://python.langchain.com/docs/integrations/chat/ollama/
        from langchain_ollama import ChatOllama
        return ChatOllama(model=model_name, temperature=0)
    raise ValueError(f"Unknown provider: {provider}")

llm = load_chat_model("groq/llama-3.3-70b-versatile")
# llm = load_chat_model("openai/gpt-4o-mini")
# llm = load_chat_model("ollama/mistral")

from langchain_qdrant import QdrantVectorStore
from langchain_community.embeddings import FastEmbedEmbeddings

vectordb = QdrantVectorStore.from_existing_collection(
    path="data/qdrant",
    collection_name="sparql-docs",
    embedding=FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5"),
)

retriever = vectordb.as_retriever()
number_of_docs_retrieved = 5

# retrieved_docs = retriever.invoke(question, k=number_of_docs_retrieved)

from qdrant_client.models import FieldCondition, Filter, MatchValue
from langchain_core.documents import Document

def retrieve_docs(question: str) -> list[Document]:
    """Retrieve relevant documents: first query examples, then other docs (e.g. class schemas)."""
    retrieved_docs = retriever.invoke(
        question,
        k=number_of_docs_retrieved,
        filter=Filter(
            must=[
                FieldCondition(
                    key="metadata.doc_type",
                    match=MatchValue(value="SPARQL endpoints query examples"),
                )
            ]
        ),
    )
    retrieved_docs += retriever.invoke(
        question,
        k=number_of_docs_retrieved,
        filter=Filter(
            must_not=[
                FieldCondition(
                    key="metadata.doc_type",
                    match=MatchValue(value="SPARQL endpoints query examples"),
                )
            ]
        ),
    )
    return retrieved_docs

# retrieved_docs = retrieve_docs(question)

# print(f"📚️ Retrieved {len(retrieved_docs)} documents")
# # print(retrieved_docs)
# for doc in retrieved_docs:
#     print(f"{doc.metadata.get('doc_type')} - {doc.metadata.get('endpoint_url')} - {doc.page_content}")


from langchain_core.prompts import ChatPromptTemplate

def _format_doc(doc: Document) -> str:
    """Format a question/answer document to be provided as context to the model."""
    doc_lang = (
        "sparql" if "query" in doc.metadata.get("doc_type", "")
        else "shex" if "schema" in doc.metadata.get("doc_type", "")
        else ""
    )
    return f"<document>\n{doc.page_content} ({doc.metadata.get('endpoint_url')}):\n\n```{doc_lang}\n{doc.metadata.get('answer')}\n```\n</document>"


SYSTEM_PROMPT = """You are an assistant that helps users to write SPARQL queries.
Put the SPARQL query inside a markdown codeblock with the "sparql" language tag, and always add the URL of the endpoint on which the query should be executed in a comment at the start of the query inside the codeblocks.
Use the queries examples and classes shapes provided in the prompt to derive your answer, don't try to create a query from nothing and do not provide a generic query.
Try to always answer with one query, if the answer lies in different endpoints, provide a federated query.
And briefly explain the query.
Here is a list of documents (reference questions and query answers, classes schema) relevant to the user question that will help you answer the user question accurately:
{retrieved_docs}
"""
prompt_template = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("placeholder", "{messages}"),
])

# formatted_docs = "\n".join(doc.page_content + "\n" + doc.metadata.get("answer") for doc in retrieved_docs)
# formatted_docs = f"<documents>\n{'\n'.join(_format_doc(doc) for doc in retrieved_docs)}\n</documents>"
# prompt_with_context = prompt_template.invoke({
#     "messages": [("human", question)],
#     "retrieved_docs": formatted_docs,
# })

# print("\n".join(str(msg) for msg in prompt_with_context.messages))

# resp = llm.invoke("What are the rat orthologs of human TP53?")
# print(resp)

# for msg in llm.stream(prompt_with_context):
#     print(msg.content, end='')


import chainlit as cl

@cl.on_message
async def on_message(msg: cl.Message):
    retrieved_docs = retrieve_docs(msg.content)
    formatted_docs = f"<documents>\n{'\n'.join(_format_doc(doc) for doc in retrieved_docs)}\n</documents>"
    async with cl.Step(name=f"{len(retrieved_docs)} relevant documents") as step:
        # step.input = msg.content
        step.output = formatted_docs

    prompt_with_context = prompt_template.invoke({
        "messages": [("human", msg.content)],
        "retrieved_docs": formatted_docs,
    })
    final_answer = cl.Message(content="")
    for resp in llm.stream(prompt_with_context):
        await final_answer.stream_token(resp.content)
    await final_answer.send()
56 changes: 56 additions & 0 deletions tutorial/index.py
@@ -0,0 +1,56 @@
import os

from langchain_qdrant import QdrantVectorStore
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_core.documents import Document
from sparql_llm import SparqlExamplesLoader, SparqlVoidShapesLoader


# List of endpoints that will be used
endpoints: list[dict[str, str]] = [
    {
        # The URL of the SPARQL endpoint from which most information will be extracted
        "endpoint_url": "https://sparql.uniprot.org/sparql/",
        # If VoID description or SPARQL query examples are not available in the endpoint,
        # you can provide a VoID file (local or remote URL)
        "void_file": "../packages/sparql-llm/tests/void_uniprot.ttl",
        # "examples_file": "uniprot_examples.ttl",
    },
    {
        "endpoint_url": "https://www.bgee.org/sparql/",
    },
    {
        "endpoint_url": "https://sparql.omabrowser.org/sparql/",
    },
]


# Get documents from the SPARQL endpoints
docs: list[Document] = []
for endpoint in endpoints:
    print(f"\n 🔎 Getting metadata for {endpoint['endpoint_url']}")
    queries_loader = SparqlExamplesLoader(
        endpoint["endpoint_url"],
        examples_file=endpoint.get("examples_file"),
        verbose=True,
    )
    docs += queries_loader.load()

    void_loader = SparqlVoidShapesLoader(
        endpoint["endpoint_url"],
        void_file=endpoint.get("void_file"),
        verbose=True,
    )
    docs += void_loader.load()

os.makedirs('data', exist_ok=True)

vectordb = QdrantVectorStore.from_documents(
    docs,
    path="data/qdrant",
    collection_name="sparql-docs",
    force_recreate=True,
    embedding=FastEmbedEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        # providers=["CUDAExecutionProvider"],  # Uncomment this line to use your GPUs
    ),
)
19 changes: 19 additions & 0 deletions tutorial/pyproject.toml
@@ -0,0 +1,19 @@
[project]
name = "tutorial-sparql-agent"
version = "0.0.1"
requires-python = "==3.12.*"

dependencies = [
    "sparql-llm >=0.0.4",
    "langchain >=0.3.14",
    "langchain-community >=0.3.17",
    "langchain-openai >=0.3.6",
    "langchain-groq >=0.2.4",
    "langchain-ollama >=0.2.3",
    "langchain-qdrant >=0.2.0",
    "qdrant-client >=1.13.0",
    "fastembed >=0.5.1",
    # "fastembed-gpu >=0.5.1",  # Optional GPU support
    "chainlit",
    "langgraph >=0.2.73",
]
3 changes: 1 addition & 2 deletions tutorial/slides/index.html
@@ -38,7 +38,7 @@

  <style>
    /* Improve text size and codeblocks */
-   .reveal section p,pre code {
+   .reveal section p,li,pre code {
      font-size: 0.6em;
    }
    .reveal section h2 {
@@ -55,7 +55,6 @@
    }
    .reveal section a code {
      background: #8b743d;
-     color: #dcdcdc;
    }
  </style>
</body>
39 changes: 24 additions & 15 deletions tutorial/slides/public/slides.md
@@ -1,6 +1,12 @@
-## Setup
+## Introduction

+In this tutorial, you'll learn how to build an LLM-powered app that assists in writing SPARQL queries, step by step.
+
+As we progress, you'll be provided with code snippets to gradually construct the system. Note that some earlier code may need to be modified or removed to prevent redundancy, ensuring a clean and efficient implementation.
+
+---
+
-This tutorial will guide you through creating an LLM-based app step by step, you will be given pieces of code to build up the system, while we advance through the tutorial you will need to comment out or delete some previous pieces of code to avoid useless repetitions.
+## Setup

[Install `uv`](https://docs.astral.sh/uv/getting-started/installation/) to easily handle dependencies and run scripts

@@ -13,6 +19,8 @@ GROQ_API_KEY=gsk_YYY
OPENAI_API_KEY=sk-proj-YYY
```

+> You can get a [free API key on groq.com](https://console.groq.com/keys) after logging in with GitHub or Google. This gives you access to [various open-source models](https://groq.com/pricing/) with a limit of 6k tokens per minute.
+
---

## Setup dependencies
@@ -26,7 +34,6 @@ version = "0.0.1"
requires-python = "==3.12.*"
dependencies = [
    "sparql-llm >=0.0.4",
-   "langgraph >=0.2.73",
    "langchain >=0.3.14",
    "langchain-community >=0.3.17",
    "langchain-openai >=0.3.6",
@@ -36,6 +43,7 @@ dependencies = [
    "qdrant-client >=1.13.0",
    "fastembed >=0.5.1",
    "chainlit >=2.2.1",
+   "langgraph >=0.2.73",
]
```

@@ -205,6 +213,8 @@ vectordb = QdrantVectorStore.from_documents(
)
```

+> To use a GPU, you will need to replace the `fastembed` dependency with `fastembed-gpu`.
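
For reference (not part of this commit), a sketch of that swap in `tutorial/pyproject.toml`:

```diff
 dependencies = [
-    "fastembed >=0.5.1",
+    "fastembed-gpu >=0.5.1",  # GPU build, replaces fastembed
 ]
```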
---

## Provide context to the LLM
@@ -257,8 +267,6 @@ prompt_with_context = prompt_template.invoke({
    "messages": [("human", question)],
    "retrieved_docs": formatted_docs,
})
-
-
```

---
@@ -272,11 +280,11 @@ from langchain_core.documents import Document

def _format_doc(doc: Document) -> str:
    """Format our question/answer document to be provided as context to the model."""
-   doc_lang = ""
-   if "query" in doc.metadata.get("doc_type", ""):
-       doc_lang = "sparql"
-   elif "schema" in doc.metadata.get("doc_type", ""):
-       doc_lang = "shex"
+   doc_lang = (
+       "sparql" if "query" in doc.metadata.get("doc_type", "")
+       else "shex" if "schema" in doc.metadata.get("doc_type", "")
+       else ""
+   )
    return f"<document>\n{doc.page_content} ({doc.metadata.get('endpoint_url', '')}):\n\n```{doc_lang}\n{doc.metadata.get('answer')}\n```\n</document>"

formatted_docs = f"<documents>\n{'\n'.join(_format_doc(doc) for doc in retrieved_docs)}\n</documents>"
@@ -335,17 +343,14 @@ async def on_message(msg: cl.Message):
    retrieved_docs = retrieve_docs(msg.content)
    formatted_docs = f"<documents>\n{'\n'.join(_format_doc(doc) for doc in retrieved_docs)}\n</documents>"
    async with cl.Step(name=f"{len(retrieved_docs)} relevant documents") as step:
-       step.input = msg.content
        step.output = formatted_docs

    prompt_with_context = prompt_template.invoke({
        "messages": [("human", msg.content)],
        "retrieved_docs": formatted_docs,
    })
    final_answer = cl.Message(content="")
    for resp in llm.stream(prompt_with_context):
-       if resp.content:
-           await final_answer.stream_token(resp.content)
+       await final_answer.stream_token(resp.content)
    await final_answer.send()
```

@@ -359,4 +364,8 @@ uv run chainlit run app.py

## Creating more complex "agents"

-e.g. reactive agent that can loop over themselves and use tools using [LangGraph](https://langchain-ai.github.io/langgraph/#)
+e.g. reactive agents that can loop over themselves using [LangGraph](https://langchain-ai.github.io/langgraph/#):
+
+- To validate a generated query
+- To use tools
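
For reference (not part of this commit), a minimal sketch of such a validation loop with LangGraph, reusing `llm` from `app.py` and assuming a hypothetical `validate_sparql()` helper that returns an error message for an invalid query, or `None` if it is valid:

```python
from langgraph.graph import END, START, MessagesState, StateGraph

def generate(state: MessagesState) -> dict:
    # (Re)generate an answer containing a SPARQL query
    return {"messages": [llm.invoke(state["messages"])]}

def validate(state: MessagesState) -> dict:
    # validate_sparql() is a hypothetical helper returning an error string, or None if valid
    error = validate_sparql(state["messages"][-1].content)
    if error:
        return {"messages": [("human", f"Fix this error in the query: {error}")]}
    return {}

def route(state: MessagesState) -> str:
    # Loop back to the LLM if validation appended an error message, otherwise stop
    return "generate" if "Fix this error" in str(state["messages"][-1].content) else END

builder = StateGraph(MessagesState)
builder.add_node("generate", generate)
builder.add_node("validate", validate)
builder.add_edge(START, "generate")
builder.add_edge("generate", "validate")
builder.add_conditional_edges("validate", route)
graph = builder.compile()
```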
