Skip to content

Commit

Permalink
use logging instead of print in sparql-llm. add SparqlInfoLoader
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Feb 19, 2025
1 parent f6bb74f commit a7bbcf5
Show file tree
Hide file tree
Showing 27 changed files with 7,905 additions and 428 deletions.
8 changes: 4 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ __pycache__/
.*_cache
.coverage
.langgraph_api/
.chainlit
.files
uv.lock
# uv.lock

# Data
data/
Expand All @@ -21,5 +20,6 @@ nohup.out
node_modules/

packages/expasy-agent/src/expasy_agent/webapp
tutorial/chainlit.md
tutorial/public
tutorial/public/logo_*.png
tutorial/.chainlit/translations/*
!tutorial/.chainlit/translations/en-US.json
7 changes: 2 additions & 5 deletions chat-with-context/src/style.css
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
border-left: 3px solid #ddd;
background-color: #f5f5f5;
width: fit-content;
}

/* Optional: Add subtle padding and background */
think {
padding: 0.5em 1em;
border-radius: 4px;
padding: 0.5em 1em;
border-radius: 4px;
}
}
2 changes: 1 addition & 1 deletion compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ services:

vectordb:
# https://hub.docker.com/r/qdrant/qdrant/tags
image: docker.io/qdrant/qdrant:v1.13.2
image: docker.io/qdrant/qdrant:v1.13.4
# image: qdrant/qdrant:v1.9.2-unprivileged # Unprivileged don't work when mounting a volume
container_name: vectordb
restart: unless-stopped
Expand Down
2 changes: 1 addition & 1 deletion packages/expasy-agent/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dependencies = [
# "langchain-anthropic >=0.1.23",
# "langchain-fireworks >=0.1.7",
# Dependencies to deploy the API
"sparql-llm >=0.0.3",
"sparql-llm >=0.0.8",
"rdflib >=7.0.0",
"httpx >=0.27.2",
"fastapi >=0.115.8",
Expand Down
4 changes: 2 additions & 2 deletions packages/expasy-agent/src/expasy_agent/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from langchain_core.runnables import RunnableConfig, ensure_config
from pydantic_settings import BaseSettings, SettingsConfigDict
from qdrant_client import QdrantClient
from sparql_llm.utils import SparqlEndpointInfo
from sparql_llm.utils import SparqlEndpointLinks

from expasy_agent import prompts

Expand All @@ -17,7 +17,7 @@ class Settings(BaseSettings):
"""Define the service settings for the agent that can be set using environment variables."""

# The list of endpoints that will be indexed and supported by the service
endpoints: list[SparqlEndpointInfo] = [
endpoints: list[SparqlEndpointLinks] = [
{
# The label of the endpoint for clearer display
"label": "UniProt",
Expand Down
39 changes: 10 additions & 29 deletions packages/expasy-agent/src/expasy_agent/indexing/index_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from rdflib import RDF, Dataset, Namespace
from sparql_llm.sparql_examples_loader import SparqlExamplesLoader
from sparql_llm.sparql_void_shapes_loader import SparqlVoidShapesLoader
from sparql_llm import SparqlExamplesLoader, SparqlInfoLoader, SparqlVoidShapesLoader
from sparql_llm.utils import get_prefixes_and_schema_for_endpoints

from expasy_agent.config import settings
Expand Down Expand Up @@ -142,46 +141,28 @@ def init_vectordb() -> None:
print(
f"\n 🔎 Getting metadata for {endpoint['label']} at {endpoint['endpoint_url']}"
)
queries_loader = SparqlExamplesLoader(
docs += SparqlExamplesLoader(
endpoint["endpoint_url"],
examples_file=endpoint.get("examples_file"),
verbose=True,
)
docs += queries_loader.load()
).load()

void_loader = SparqlVoidShapesLoader(
docs += SparqlVoidShapesLoader(
endpoint["endpoint_url"],
prefix_map=prefix_map,
void_file=endpoint.get("void_file"),
examples_file=endpoint.get("examples_file"),
verbose=True,
)
docs += void_loader.load()
).load()

docs += load_schemaorg_description(endpoint)
# NOTE: we dont use the ontology for now, schema from shex is better
# docs += load_ontology(endpoint)

# Add some documents for general information about the resources
resources_summary_question = "Which resources are available through this system?"
docs.append(
Document(
page_content=resources_summary_question,
metadata={
"question": resources_summary_question,
"answer": "This system helps to access the following resources from the Swiss Institute of Bioinformatics:"
+ "\n- ".join(
[
f"{endpoint.get('label', '')}: {endpoint['endpoint_url']}"
for endpoint in settings.endpoints
]
),
# "endpoint_url": "https://sparql.uniprot.org/sparql/",
"iri": "http://www.uniprot.org/help/about",
"doc_type": "General information",
},
)
)
docs += SparqlInfoLoader(
settings.endpoints,
source_iri="https://www.expasy.org/",
org_label="from the Swiss Institute of Bioinformatics (SIB)",
).load()

# NOTE: Manually add infos for UniProt since we cant retrieve it for now. Taken from https://www.uniprot.org/help/about
uniprot_description_question = "What is the SIB resource UniProt about?"
Expand Down
5 changes: 3 additions & 2 deletions packages/sparql-llm/src/sparql_llm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Utilities to improve LLMs capabilities when working with SPARQL and RDF."""

__version__ = "0.0.6"
__version__ = "0.0.8"

from .utils import SparqlEndpointInfo
from .utils import SparqlEndpointLinks
from .validate_sparql import validate_sparql_in_msg, validate_sparql_with_void
from .sparql_examples_loader import SparqlExamplesLoader
from .sparql_void_shapes_loader import SparqlVoidShapesLoader, get_shex_dict_from_void, get_shex_from_void
from .sparql_info_loader import SparqlInfoLoader
10 changes: 4 additions & 6 deletions packages/sparql-llm/src/sparql_llm/sparql_examples_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from langchain_core.documents import Document
from rdflib.plugins.sparql import prepareQuery

from sparql_llm.utils import get_prefixes_for_endpoint, query_sparql
from sparql_llm.utils import get_prefixes_for_endpoint, logger, query_sparql

GET_SPARQL_EXAMPLES_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Expand All @@ -25,7 +25,7 @@ class SparqlExamplesLoader(BaseLoader):
Compatible with the LangChain framework.
"""

def __init__(self, endpoint_url: str, examples_file: Optional[str] = None, verbose: bool = False):
def __init__(self, endpoint_url: str, examples_file: Optional[str] = None):
"""
Initialize the SparqlExamplesLoader.
Expand All @@ -34,7 +34,6 @@ def __init__(self, endpoint_url: str, examples_file: Optional[str] = None, verbo
"""
self.endpoint_url = endpoint_url
self.examples_file = examples_file
self.verbose = verbose

def load(self) -> list[Document]:
"""Load and return documents from the SPARQL endpoint."""
Expand All @@ -47,10 +46,9 @@ def load(self) -> list[Document]:
]["bindings"]:
docs.append(self._create_document(row, prefix_map))
except Exception as e:
print(f"Could not retrieve SPARQL query examples from endpoint {self.endpoint_url}: {e}")
logger.warning(f"Could not retrieve SPARQL query examples from endpoint {self.endpoint_url}: {e}")

if self.verbose:
print(f"Found {len(docs)} examples queries for {self.endpoint_url}")
logger.info(f"Found {len(docs)} examples queries for {self.endpoint_url}")
return docs

def _create_document(self, row: Any, prefix_map: dict[str, str]) -> Document:
Expand Down
48 changes: 48 additions & 0 deletions packages/sparql-llm/src/sparql_llm/sparql_info_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Optional

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from sparql_llm.utils import SparqlEndpointLinks, logger

DOC_TYPE = "General information"


class SparqlInfoLoader(BaseLoader):
    """Build general-information documents describing a set of SPARQL endpoints.

    Produces a single question/answer document summarizing which endpoints are
    available, suitable for indexing alongside query examples and shapes.
    """

    def __init__(self, endpoints: list[SparqlEndpointLinks], source_iri: Optional[str] = None, org_label: str = ""):
        """Initialize the SparqlInfoLoader.

        Args:
            endpoints: Endpoints to list in the generated summary document.
            source_iri: Optional IRI stored as the document's ``iri`` metadata.
            org_label: Extra text appended to the summary sentence
                (e.g. the organization providing the endpoints).
        """
        self.endpoints = endpoints
        self.source_iri = source_iri
        self.org_label = org_label

    def load(self) -> list[Document]:
        """Load and return documents from the SPARQL endpoint."""
        resources_summary_question = "Which resources are available through this system?"

        # One bullet per endpoint, prefixed by its label when one is provided.
        bullet_lines = []
        for endpoint in self.endpoints:
            label = endpoint.get("label")
            if label:
                bullet_lines.append(f"{label}: {endpoint['endpoint_url']}")
            else:
                bullet_lines.append(f"{endpoint['endpoint_url']}")

        answer = (
            f"This system helps to access the following SPARQL endpoints {self.org_label}:\n- "
            + "\n- ".join(bullet_lines)
        )
        metadata = {
            "question": resources_summary_question,
            "answer": answer,
            "doc_type": DOC_TYPE,
        }
        # Only attach a source IRI when one was configured.
        if self.source_iri:
            metadata["iri"] = self.source_iri

        docs: list[Document] = [
            Document(
                page_content=resources_summary_question,
                metadata=metadata,
            )
        ]

        logger.info(f"Added {len(docs)} documents with general informations")
        return docs
15 changes: 9 additions & 6 deletions packages/sparql-llm/src/sparql_llm/sparql_void_shapes_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from sparql_llm.utils import get_prefix_converter, get_prefixes_for_endpoint, get_schema_for_endpoint, query_sparql
from sparql_llm.utils import (
get_prefix_converter,
get_prefixes_for_endpoint,
get_schema_for_endpoint,
logger,
query_sparql,
)

DEFAULT_NAMESPACES_TO_IGNORE = [
"http://www.w3.org/ns/sparql-service-description#",
Expand Down Expand Up @@ -96,7 +102,7 @@ def get_shex_dict_from_void(
# shex_dict[cls]["label"] += f": {label_triple['comment']['value']}"
shex_dict[cls]["comment"] = label_triple["comment"]["value"]
except Exception as e:
print(f"Could not retrieve labels for classes in endpoint {endpoint_url}: {e}")
logger.warning(f"Could not retrieve labels for classes in endpoint {endpoint_url}: {e}")

return shex_dict

Expand Down Expand Up @@ -135,7 +141,6 @@ def __init__(
examples_file: Optional[str] = None,
namespaces_to_ignore: Optional[list[str]] = None,
prefix_map: Optional[dict[str, str]] = None,
verbose: bool = False,
):
"""
Initialize the SparqlVoidShapesLoader.
Expand All @@ -148,7 +153,6 @@ def __init__(
self.examples_file = examples_file
self.prefix_map = prefix_map
self.namespaces_to_ignore = namespaces_to_ignore
self.verbose = verbose

def load(self) -> list[Document]:
"""Load and return documents from the SPARQL endpoint."""
Expand Down Expand Up @@ -188,6 +192,5 @@ def load(self) -> list[Document]:
)
)

if self.verbose:
print(f"Extracted {len(docs)} ShEx shapes for {self.endpoint_url}")
logger.info(f"Extracted {len(docs)} ShEx shapes for {self.endpoint_url}")
return docs
25 changes: 17 additions & 8 deletions packages/sparql-llm/src/sparql_llm/utils.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
import json
import logging
from typing import Any, Optional, TypedDict

import httpx
import rdflib
from curies_rs import Converter

# import logging
# logging.getLogger("httpx").setLevel(logging.WARNING)

# Prefixes utilities
# Disable logger in your code with logging.getLogger("sparql_llm").setLevel(logging.WARNING)
logger = logging.getLogger("sparql_llm")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s: %(message)s")
# formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
# handler.setFormatter(formatter)
logger.addHandler(handler)


class SparqlEndpointInfo(TypedDict, total=False):
class SparqlEndpointLinks(TypedDict, total=False):
"""A dictionary to store links and filepaths about a SPARQL endpoint."""

endpoint_url: str
void_file: Optional[str]
examples_file: Optional[str]
homepage_url: Optional[str]
label: Optional[str]
# ontology_url: Optional[str]


# Prefixes utilities

GET_PREFIXES_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?prefix ?namespace
Expand All @@ -30,7 +39,7 @@ class SparqlEndpointInfo(TypedDict, total=False):


def get_prefixes_and_schema_for_endpoints(
endpoints: list[SparqlEndpointInfo],
endpoints: list[SparqlEndpointLinks],
) -> tuple[dict[str, str], "EndpointsSchemaDict"]:
"""Return a dictionary of prefixes and a dictionary of VoID classes schema for the given endpoints."""
prefixes_map: dict[str, str] = {}
Expand All @@ -54,7 +63,7 @@ def get_prefixes_for_endpoint(
if row["namespace"]["value"] not in prefixes_map.values():
prefixes_map[row["prefix"]["value"]] = row["namespace"]["value"]
except Exception as e:
print(f"Error retrieving prefixes for {endpoint_url}: {e}")
logger.warning(f"Error retrieving prefixes for {endpoint_url}: {e}")
return prefixes_map


Expand Down Expand Up @@ -143,7 +152,7 @@ def get_schema_for_endpoint(endpoint_url: str, void_file: Optional[str] = None)
if len(void_dict) == 0:
raise Exception("No VoID description found")
except Exception as e:
print(f"Could not retrieve VoID description from {void_file if void_file else endpoint_url}: {e}")
logger.warning(f"Could not retrieve VoID description from {void_file if void_file else endpoint_url}: {e}")
return void_dict


Expand Down
11 changes: 7 additions & 4 deletions packages/sparql-llm/src/sparql_llm/validate_sparql.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
get_prefix_converter,
get_prefixes_for_endpoint,
get_schema_for_endpoint,
logger,
)

queries_pattern = re.compile(r"```sparql(.*?)```", re.DOTALL)
Expand Down Expand Up @@ -193,7 +194,7 @@ def validate_triple_pattern(

# No type provided directly for this entity, we check if provided predicates match one of the potential type inferred for parent type
elif parent_type and parent_pred:
# print(f"CHECKING subject {subj} parent type {parent_type} parent pred {parent_pred}")
# print(f"Checking subject {subj} parent type {parent_type} parent pred {parent_pred}")
missing_pred = None
potential_types = void_dict.get(parent_type, {}).get(parent_pred, [])
if potential_types:
Expand All @@ -205,7 +206,7 @@ def validate_triple_pattern(
# Find any predicate in pred_dict.keys() that is not in potential_preds
missing_pred = next((pred for pred in pred_dict if pred not in potential_preds), None)
if missing_pred is None:
# print(f"OK Subject {subj} is a valid inferred {potential_type}!")
# print(f"Subject {subj} is a valid inferred {potential_type}!")
for pred in pred_dict:
for obj in pred_dict[pred]:
# If object is variable, we try to validate it too passing the potential type we just validated
Expand All @@ -215,7 +216,7 @@ def validate_triple_pattern(
)
break
if missing_pred is not None:
# print(f"!!!! Subject {subj} {parent_type} {parent_pred} is not a valid {potential_types} !")
# print(f"Subject {subj} {parent_type} {parent_pred} is not a valid {potential_types} !")
issues.add(
f"Subject {subj} in endpoint {endpoint} does not support the predicate `{prefix_converter.compress_list([missing_pred])[0]}`. Correct predicate might be one of the following: `{'`, `'.join(prefix_converter.compress_list(list(potential_preds)))}` (we inferred this variable might be of the type `{prefix_converter.compress_list([potential_type])[0]}`)"
)
Expand Down Expand Up @@ -264,7 +265,9 @@ def validate_triple_pattern(
try:
issues_msgs = validate_triple_pattern(subj, subj_dict, void_dict, endpoint, issues_msgs)
except Exception as e:
print(f"Error validating triples for subject {subj} in endpoint {endpoint} and query {query}: {e!s}")
logger.warning(
f"Error validating triples for subject {subj} in endpoint {endpoint} and query {query}: {e!s}"
)

return issues_msgs

Expand Down
Loading

0 comments on commit a7bbcf5

Please sign in to comment.