Skip to content

Commit

Permalink
use logging instead of print in sparql-llm. add SparqlInfoLoader
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Feb 19, 2025
1 parent f6bb74f commit a7bbcf5
Show file tree
Hide file tree
Showing 27 changed files with 7,905 additions and 428 deletions.
8 changes: 4 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ __pycache__/
.*_cache
.coverage
.langgraph_api/
.chainlit
.files
uv.lock
# uv.lock

# Data
data/
Expand All @@ -21,5 +20,6 @@ nohup.out
node_modules/

packages/expasy-agent/src/expasy_agent/webapp
tutorial/chainlit.md
tutorial/public
tutorial/public/logo_*.png
tutorial/.chainlit/translations/*
!tutorial/.chainlit/translations/en-US.json
7 changes: 2 additions & 5 deletions chat-with-context/src/style.css
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
border-left: 3px solid #ddd;
background-color: #f5f5f5;
width: fit-content;
}

/* Optional: Add subtle padding and background */
think {
padding: 0.5em 1em;
border-radius: 4px;
padding: 0.5em 1em;
border-radius: 4px;
}
}
2 changes: 1 addition & 1 deletion compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ services:

vectordb:
# https://hub.docker.com/r/qdrant/qdrant/tags
image: docker.io/qdrant/qdrant:v1.13.2
image: docker.io/qdrant/qdrant:v1.13.4
# image: qdrant/qdrant:v1.9.2-unprivileged # Unprivileged don't work when mounting a volume
container_name: vectordb
restart: unless-stopped
Expand Down
2 changes: 1 addition & 1 deletion packages/expasy-agent/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dependencies = [
# "langchain-anthropic >=0.1.23",
# "langchain-fireworks >=0.1.7",
# Dependencies to deploy the API
"sparql-llm >=0.0.3",
"sparql-llm >=0.0.8",
"rdflib >=7.0.0",
"httpx >=0.27.2",
"fastapi >=0.115.8",
Expand Down
4 changes: 2 additions & 2 deletions packages/expasy-agent/src/expasy_agent/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from langchain_core.runnables import RunnableConfig, ensure_config
from pydantic_settings import BaseSettings, SettingsConfigDict
from qdrant_client import QdrantClient
from sparql_llm.utils import SparqlEndpointInfo
from sparql_llm.utils import SparqlEndpointLinks

from expasy_agent import prompts

Expand All @@ -17,7 +17,7 @@ class Settings(BaseSettings):
"""Define the service settings for the agent that can be set using environment variables."""

# The list of endpoints that will be indexed and supported by the service
endpoints: list[SparqlEndpointInfo] = [
endpoints: list[SparqlEndpointLinks] = [
{
# The label of the endpoint for clearer display
"label": "UniProt",
Expand Down
39 changes: 10 additions & 29 deletions packages/expasy-agent/src/expasy_agent/indexing/index_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from rdflib import RDF, Dataset, Namespace
from sparql_llm.sparql_examples_loader import SparqlExamplesLoader
from sparql_llm.sparql_void_shapes_loader import SparqlVoidShapesLoader
from sparql_llm import SparqlExamplesLoader, SparqlInfoLoader, SparqlVoidShapesLoader
from sparql_llm.utils import get_prefixes_and_schema_for_endpoints

from expasy_agent.config import settings
Expand Down Expand Up @@ -142,46 +141,28 @@ def init_vectordb() -> None:
print(
f"\n 🔎 Getting metadata for {endpoint['label']} at {endpoint['endpoint_url']}"
)
queries_loader = SparqlExamplesLoader(
docs += SparqlExamplesLoader(
endpoint["endpoint_url"],
examples_file=endpoint.get("examples_file"),
verbose=True,
)
docs += queries_loader.load()
).load()

void_loader = SparqlVoidShapesLoader(
docs += SparqlVoidShapesLoader(
endpoint["endpoint_url"],
prefix_map=prefix_map,
void_file=endpoint.get("void_file"),
examples_file=endpoint.get("examples_file"),
verbose=True,
)
docs += void_loader.load()
).load()

docs += load_schemaorg_description(endpoint)
# NOTE: we dont use the ontology for now, schema from shex is better
# docs += load_ontology(endpoint)

# Add some documents for general information about the resources
resources_summary_question = "Which resources are available through this system?"
docs.append(
Document(
page_content=resources_summary_question,
metadata={
"question": resources_summary_question,
"answer": "This system helps to access the following resources from the Swiss Institute of Bioinformatics:"
+ "\n- ".join(
[
f"{endpoint.get('label', '')}: {endpoint['endpoint_url']}"
for endpoint in settings.endpoints
]
),
# "endpoint_url": "https://sparql.uniprot.org/sparql/",
"iri": "http://www.uniprot.org/help/about",
"doc_type": "General information",
},
)
)
docs += SparqlInfoLoader(
settings.endpoints,
source_iri="https://www.expasy.org/",
org_label="from the Swiss Institute of Bioinformatics (SIB)",
).load()

# NOTE: Manually add infos for UniProt since we cant retrieve it for now. Taken from https://www.uniprot.org/help/about
uniprot_description_question = "What is the SIB resource UniProt about?"
Expand Down
5 changes: 3 additions & 2 deletions packages/sparql-llm/src/sparql_llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Utilities to improve LLMs capabilities when working with SPARQL and RDF."""

__version__ = "0.0.6"
__version__ = "0.0.8"

from .utils import SparqlEndpointInfo
from .utils import SparqlEndpointLinks
from .validate_sparql import validate_sparql_in_msg, validate_sparql_with_void
from .sparql_examples_loader import SparqlExamplesLoader
from .sparql_void_shapes_loader import SparqlVoidShapesLoader, get_shex_dict_from_void, get_shex_from_void
from .sparql_info_loader import SparqlInfoLoader
10 changes: 4 additions & 6 deletions packages/sparql-llm/src/sparql_llm/sparql_examples_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from langchain_core.documents import Document
from rdflib.plugins.sparql import prepareQuery

from sparql_llm.utils import get_prefixes_for_endpoint, query_sparql
from sparql_llm.utils import get_prefixes_for_endpoint, logger, query_sparql

GET_SPARQL_EXAMPLES_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Expand All @@ -25,7 +25,7 @@ class SparqlExamplesLoader(BaseLoader):
Compatible with the LangChain framework.
"""

def __init__(self, endpoint_url: str, examples_file: Optional[str] = None, verbose: bool = False):
def __init__(self, endpoint_url: str, examples_file: Optional[str] = None):
"""
Initialize the SparqlExamplesLoader.
Expand All @@ -34,7 +34,6 @@ def __init__(self, endpoint_url: str, examples_file: Optional[str] = None, verbo
"""
self.endpoint_url = endpoint_url
self.examples_file = examples_file
self.verbose = verbose

def load(self) -> list[Document]:
"""Load and return documents from the SPARQL endpoint."""
Expand All @@ -47,10 +46,9 @@ def load(self) -> list[Document]:
]["bindings"]:
docs.append(self._create_document(row, prefix_map))
except Exception as e:
print(f"Could not retrieve SPARQL query examples from endpoint {self.endpoint_url}: {e}")
logger.warning(f"Could not retrieve SPARQL query examples from endpoint {self.endpoint_url}: {e}")

if self.verbose:
print(f"Found {len(docs)} examples queries for {self.endpoint_url}")
logger.info(f"Found {len(docs)} examples queries for {self.endpoint_url}")
return docs

def _create_document(self, row: Any, prefix_map: dict[str, str]) -> Document:
Expand Down
48 changes: 48 additions & 0 deletions packages/sparql-llm/src/sparql_llm/sparql_info_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Optional

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from sparql_llm.utils import SparqlEndpointLinks, logger

DOC_TYPE = "General information"


class SparqlInfoLoader(BaseLoader):
    """Build general-information documents describing a set of SPARQL endpoints.

    Produces a single question/answer document summarizing which endpoints are
    available, suitable for indexing alongside query examples and shapes.
    """

    def __init__(self, endpoints: list[SparqlEndpointLinks], source_iri: Optional[str] = None, org_label: str = ""):
        """Initialize the SparqlInfoLoader.

        Args:
            endpoints: Endpoints to list in the generated summary document.
            source_iri: Optional IRI stored as the document's ``iri`` metadata.
            org_label: Extra text appended to the summary sentence
                (e.g. the organization providing the endpoints).
        """
        self.endpoints = endpoints
        self.source_iri = source_iri
        self.org_label = org_label

    def load(self) -> list[Document]:
        """Load and return documents from the SPARQL endpoint."""
        resources_summary_question = "Which resources are available through this system?"

        # One bullet per endpoint, prefixed by its label when one is provided.
        bullet_lines = []
        for endpoint in self.endpoints:
            label = endpoint.get("label")
            if label:
                bullet_lines.append(f"{label}: {endpoint['endpoint_url']}")
            else:
                bullet_lines.append(f"{endpoint['endpoint_url']}")

        answer = (
            f"This system helps to access the following SPARQL endpoints {self.org_label}:\n- "
            + "\n- ".join(bullet_lines)
        )
        metadata = {
            "question": resources_summary_question,
            "answer": answer,
            "doc_type": DOC_TYPE,
        }
        # Only attach a source IRI when one was configured.
        if self.source_iri:
            metadata["iri"] = self.source_iri

        docs: list[Document] = [
            Document(
                page_content=resources_summary_question,
                metadata=metadata,
            )
        ]

        logger.info(f"Added {len(docs)} documents with general informations")
        return docs
15 changes: 9 additions & 6 deletions packages/sparql-llm/src/sparql_llm/sparql_void_shapes_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from sparql_llm.utils import get_prefix_converter, get_prefixes_for_endpoint, get_schema_for_endpoint, query_sparql
from sparql_llm.utils import (
get_prefix_converter,
get_prefixes_for_endpoint,
get_schema_for_endpoint,
logger,
query_sparql,
)

DEFAULT_NAMESPACES_TO_IGNORE = [
"http://www.w3.org/ns/sparql-service-description#",
Expand Down Expand Up @@ -96,7 +102,7 @@ def get_shex_dict_from_void(
# shex_dict[cls]["label"] += f": {label_triple['comment']['value']}"
shex_dict[cls]["comment"] = label_triple["comment"]["value"]
except Exception as e:
print(f"Could not retrieve labels for classes in endpoint {endpoint_url}: {e}")
logger.warning(f"Could not retrieve labels for classes in endpoint {endpoint_url}: {e}")

return shex_dict

Expand Down Expand Up @@ -135,7 +141,6 @@ def __init__(
examples_file: Optional[str] = None,
namespaces_to_ignore: Optional[list[str]] = None,
prefix_map: Optional[dict[str, str]] = None,
verbose: bool = False,
):
"""
Initialize the SparqlVoidShapesLoader.
Expand All @@ -148,7 +153,6 @@ def __init__(
self.examples_file = examples_file
self.prefix_map = prefix_map
self.namespaces_to_ignore = namespaces_to_ignore
self.verbose = verbose

def load(self) -> list[Document]:
"""Load and return documents from the SPARQL endpoint."""
Expand Down Expand Up @@ -188,6 +192,5 @@ def load(self) -> list[Document]:
)
)

if self.verbose:
print(f"Extracted {len(docs)} ShEx shapes for {self.endpoint_url}")
logger.info(f"Extracted {len(docs)} ShEx shapes for {self.endpoint_url}")
return docs
25 changes: 17 additions & 8 deletions packages/sparql-llm/src/sparql_llm/utils.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
import json
import logging
from typing import Any, Optional, TypedDict

import httpx
import rdflib
from curies_rs import Converter

# import logging
# logging.getLogger("httpx").setLevel(logging.WARNING)

# Prefixes utilities
# Disable logger in your code with logging.getLogger("sparql_llm").setLevel(logging.WARNING)
logger = logging.getLogger("sparql_llm")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s: %(message)s")
# formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
# handler.setFormatter(formatter)
logger.addHandler(handler)


class SparqlEndpointInfo(TypedDict, total=False):
class SparqlEndpointLinks(TypedDict, total=False):
"""A dictionary to store links and filepaths about a SPARQL endpoint."""

endpoint_url: str
void_file: Optional[str]
examples_file: Optional[str]
homepage_url: Optional[str]
label: Optional[str]
# ontology_url: Optional[str]


# Prefixes utilities

GET_PREFIXES_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?prefix ?namespace
Expand All @@ -30,7 +39,7 @@ class SparqlEndpointInfo(TypedDict, total=False):


def get_prefixes_and_schema_for_endpoints(
endpoints: list[SparqlEndpointInfo],
endpoints: list[SparqlEndpointLinks],
) -> tuple[dict[str, str], "EndpointsSchemaDict"]:
"""Return a dictionary of prefixes and a dictionary of VoID classes schema for the given endpoints."""
prefixes_map: dict[str, str] = {}
Expand All @@ -54,7 +63,7 @@ def get_prefixes_for_endpoint(
if row["namespace"]["value"] not in prefixes_map.values():
prefixes_map[row["prefix"]["value"]] = row["namespace"]["value"]
except Exception as e:
print(f"Error retrieving prefixes for {endpoint_url}: {e}")
logger.warning(f"Error retrieving prefixes for {endpoint_url}: {e}")
return prefixes_map


Expand Down Expand Up @@ -143,7 +152,7 @@ def get_schema_for_endpoint(endpoint_url: str, void_file: Optional[str] = None)
if len(void_dict) == 0:
raise Exception("No VoID description found")
except Exception as e:
print(f"Could not retrieve VoID description from {void_file if void_file else endpoint_url}: {e}")
logger.warning(f"Could not retrieve VoID description from {void_file if void_file else endpoint_url}: {e}")
return void_dict


Expand Down
11 changes: 7 additions & 4 deletions packages/sparql-llm/src/sparql_llm/validate_sparql.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
get_prefix_converter,
get_prefixes_for_endpoint,
get_schema_for_endpoint,
logger,
)

queries_pattern = re.compile(r"```sparql(.*?)```", re.DOTALL)
Expand Down Expand Up @@ -193,7 +194,7 @@ def validate_triple_pattern(

# No type provided directly for this entity, we check if provided predicates match one of the potential type inferred for parent type
elif parent_type and parent_pred:
# print(f"CHECKING subject {subj} parent type {parent_type} parent pred {parent_pred}")
# print(f"Checking subject {subj} parent type {parent_type} parent pred {parent_pred}")
missing_pred = None
potential_types = void_dict.get(parent_type, {}).get(parent_pred, [])
if potential_types:
Expand All @@ -205,7 +206,7 @@ def validate_triple_pattern(
# Find any predicate in pred_dict.keys() that is not in potential_preds
missing_pred = next((pred for pred in pred_dict if pred not in potential_preds), None)
if missing_pred is None:
# print(f"OK Subject {subj} is a valid inferred {potential_type}!")
# print(f"Subject {subj} is a valid inferred {potential_type}!")
for pred in pred_dict:
for obj in pred_dict[pred]:
# If object is variable, we try to validate it too passing the potential type we just validated
Expand All @@ -215,7 +216,7 @@ def validate_triple_pattern(
)
break
if missing_pred is not None:
# print(f"!!!! Subject {subj} {parent_type} {parent_pred} is not a valid {potential_types} !")
# print(f"Subject {subj} {parent_type} {parent_pred} is not a valid {potential_types} !")
issues.add(
f"Subject {subj} in endpoint {endpoint} does not support the predicate `{prefix_converter.compress_list([missing_pred])[0]}`. Correct predicate might be one of the following: `{'`, `'.join(prefix_converter.compress_list(list(potential_preds)))}` (we inferred this variable might be of the type `{prefix_converter.compress_list([potential_type])[0]}`)"
)
Expand Down Expand Up @@ -264,7 +265,9 @@ def validate_triple_pattern(
try:
issues_msgs = validate_triple_pattern(subj, subj_dict, void_dict, endpoint, issues_msgs)
except Exception as e:
print(f"Error validating triples for subject {subj} in endpoint {endpoint} and query {query}: {e!s}")
logger.warning(
f"Error validating triples for subject {subj} in endpoint {endpoint} and query {query}: {e!s}"
)

return issues_msgs

Expand Down
Loading

0 comments on commit a7bbcf5

Please sign in to comment.