
Commit 6c1900f

feat: added --filter param to search command (#10)
1 parent 2265bbe commit 6c1900f

3 files changed, +263 -30 lines changed

gptme_rag/cli.py

Lines changed: 23 additions & 2 deletions
@@ -358,6 +358,12 @@ def index(
     type=click.Choice(["cuda", "cpu"]),
     help="Device to run embeddings on (cuda or cpu)",
 )
+@click.option(
+    "--filter",
+    "-f",
+    multiple=True,
+    help="Filter results by path pattern (glob). Can be specified multiple times.",
+)
 def search(
     query: str,
     paths: list[Path],
@@ -371,6 +377,7 @@ def search(
     weights: str | None,
     embedding_function: str | None,
     device: str | None,
+    filter: tuple[str, ...],
 ):
     """Search the index and assemble context."""
     paths = [path.resolve() for path in paths]
@@ -405,13 +412,27 @@ def search(
             device=device or "cpu",
         )
         assembler = ContextAssembler(max_tokens=max_tokens)
+
+        # Combine paths and filters for search
+        search_paths = list(paths)
+        if filter:
+            # If no paths were specified but filters are present,
+            # search from root and apply filters
+            if not paths:
+                search_paths = [Path(".")]
+            logger.debug(f"Using path filters: {filter}")
+
         if explain:
             documents, distances, explanations = indexer.search(
-                query, n_results=n_results, paths=paths, explain=True
+                query,
+                n_results=n_results,
+                paths=search_paths,
+                path_filters=filter,
+                explain=True,
             )
         else:
             documents, distances, _ = indexer.search(
-                query, n_results=n_results, paths=paths
+                query, n_results=n_results, paths=search_paths, path_filters=filter
             )
     finally:
         sys.stdout.close()
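
For a quick end-to-end check of the new flag, click's test runner can invoke the command in-process. This is a minimal sketch, not part of the commit: it assumes the decorated search command is importable from gptme_rag.cli, that an index already exists, and that the query is accepted as a positional argument; the query string and patterns are placeholders.

    from click.testing import CliRunner

    from gptme_rag.cli import search

    runner = CliRunner()
    # --filter/-f is repeatable; here an extension filter is combined with a path pattern.
    result = runner.invoke(
        search,
        ["error handling", "--filter", "*.md", "--filter", "src/**/*.py"],
    )
    print(result.exit_code)
    print(result.output)

Since no paths are given here, the command falls back to searching from the root (Path(".")) and applies the filters there, as in the diff above.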

gptme_rag/indexing/indexer.py

Lines changed: 145 additions & 11 deletions
@@ -483,16 +483,85 @@ def compute_relevance_score(
 
         return total_score, scores
 
-    def _matches_paths(self, doc: Document, paths: list[Path]) -> bool:
-        """Check if document matches any of the given paths."""
+    def _matches_paths(
+        self,
+        doc: Document,
+        paths: list[Path] | None = None,
+        path_filters: tuple[str, ...] | None = None,
+    ) -> bool:
+        """Check if document matches any of the given paths or filters.
+
+        Args:
+            doc: Document to check
+            paths: List of paths to match against (exact path matching)
+            path_filters: List of glob patterns to match against
+
+        Returns:
+            bool: True if document matches any path or filter
+        """
         source = doc.metadata.get("source", "")
         if not source:
             return False
+
         source_path = Path(source)
-        return any(
-            path.resolve() in source_path.parents or path.resolve() == source_path
-            for path in paths
-        )
+
+        path_match = True
+        filter_match = True
+
+        # Check exact path matches if paths are specified
+        if paths:
+            path_match = any(
+                path.resolve() in source_path.parents or path.resolve() == source_path
+                for path in paths
+            )
+            if not path_match:
+                logger.debug(f"Path match failed: {source_path} not in {paths}")
+                return False
+
+        # Check pattern matches if filters are specified
+        if path_filters:
+            # Get both the full path and relative components for matching
+            source_str = str(source_path)
+            source_name = source_path.name
+            source_parts = source_path.parts
+
+            filter_match = False  # Set to True if any pattern matches
+            for pattern in path_filters:
+                logger.debug(f"Checking pattern: {pattern} against {source_str}")
+
+                # Handle different pattern types
+                if pattern.startswith("*."):
+                    # Simple extension filter
+                    if source_name.endswith(pattern[1:]):
+                        logger.debug(f"Matched extension pattern: {pattern}")
+                        filter_match = True
+                        break
+                else:
+                    # Convert pattern to parts for matching
+                    pattern_path = Path(pattern)
+                    pattern_parts = pattern_path.parts
+
+                    # Try different matching strategies
+                    if (
+                        fnmatch_path(source_str, pattern)
+                        or fnmatch_path(source_str, f"**/{pattern}")
+                        or (
+                            len(pattern_parts) <= len(source_parts)
+                            and fnmatch_path(
+                                str(Path(*source_parts[-len(pattern_parts) :])), pattern
+                            )
+                        )
+                    ):
+                        logger.debug(f"Matched path pattern: {pattern}")
+                        filter_match = True
+                        break
+
+            if not filter_match:
+                logger.debug(f"No patterns matched: {source_str}")
+                return False
+
+        # Both conditions must be met (if specified)
+        return path_match and filter_match
 
     def search(
         self,
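
The non-extension branch above tries three strategies with fnmatch_path: the full path, the pattern anchored anywhere via a **/ prefix, and the pattern against the same number of trailing path components. A minimal standalone sketch of that logic, assuming fnmatch_path behaves like the standard library's fnmatch.fnmatch; the helper name matches_filter is illustrative, not the project's API:

    import fnmatch
    from pathlib import Path


    def matches_filter(source: str, pattern: str) -> bool:
        """Sketch of the pattern checks in _matches_paths, not the project code."""
        source_path = Path(source)
        # Simple extension filters like "*.md" only look at the file name.
        if pattern.startswith("*."):
            return source_path.name.endswith(pattern[1:])
        parts = source_path.parts
        pattern_parts = Path(pattern).parts
        return (
            fnmatch.fnmatch(source, pattern)  # full-path match
            or fnmatch.fnmatch(source, f"**/{pattern}")  # match anywhere in the path
            or (
                # match against the same number of trailing path components
                len(pattern_parts) <= len(parts)
                and fnmatch.fnmatch(str(Path(*parts[-len(pattern_parts):])), pattern)
            )
        )


    # matches_filter("/home/user/project/docs/guide.md", "docs/*.md") -> True
    # matches_filter("/home/user/project/docs/guide.md", "src/*.md") -> False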
@@ -503,16 +572,80 @@ def search(
         group_chunks: bool = True,
         max_attempts: int = 3,
         explain: bool = False,
+        path_filters: tuple[str, ...] | None = None,
     ) -> tuple[list[Document], list[float], list[dict[str, Any]] | None]:
-        """Search for documents similar to the query."""
+        """Search for documents similar to the query.
+
+        Args:
+            query: The search query text
+            paths: List of paths to search within (exact path matching)
+            n_results: Maximum number of results to return
+            where: Additional where clauses for ChromaDB query
+            group_chunks: Whether to group chunks from the same document
+            max_attempts: Maximum number of search attempts
+            explain: Whether to return scoring explanations
+            path_filters: Glob patterns to filter documents by path. Supports:
+                - Simple extension filters (*.md, *.py)
+                - Path patterns (src/*.py, docs/**/*.md)
+                - Multiple patterns can be combined
+
+        Returns:
+            Tuple of (documents, distances, explanations)
+            - documents: List of matching Document objects
+            - distances: List of embedding distances
+            - explanations: List of scoring explanations (if explain=True)
+
+        Examples:
+            # Search in markdown files
+            search("query", path_filters=("*.md",))
+
+            # Search in Python files in src directory
+            search("query", path_filters=("src/**/*.py",))
+
+            # Search in multiple file types
+            search("query", path_filters=("*.md", "*.py"))
+
+            # Combine paths and filters
+            search("query", paths=[Path("docs")], path_filters=("*.md",))
+        """
         # Get more results than needed to allow for filtering
         query_n_results = n_results * 3 if group_chunks else n_results
 
+        # Prepare where clause
+        search_where = where.copy() if where else {}
+
+        # Pre-filter documents based on all patterns
+        if path_filters:
+            logger.debug(f"Filtering with patterns: {path_filters}")
+            all_docs = self.collection.get()
+            matching_sources = set()
+
+            for meta in all_docs["metadatas"]:
+                if not meta or "source" not in meta:
+                    continue
+
+                source_path = Path(meta["source"])
+                # Create a dummy document for path matching
+                doc = Document(
+                    content="", metadata=meta, doc_id="temp", source_path=source_path
+                )
+
+                # Use _matches_paths to check all patterns
+                if self._matches_paths(doc, paths=None, path_filters=path_filters):
+                    matching_sources.add(str(source_path))
+
+            if matching_sources:
+                logger.debug(f"Found {len(matching_sources)} matching files")
+                search_where["source"] = {"$in": list(matching_sources)}
+            else:
+                logger.debug("No files matched the filter patterns")
+                return [], [], [] if explain else None
+
         # Query the collection
         results = self.collection.query(
             query_texts=[query],
             n_results=query_n_results,
-            where=where,
+            where=search_where,
         )
 
         if not results["ids"][0]:
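
The pre-filtering step narrows the vector query itself rather than discarding hits afterwards: it scans the stored metadatas, keeps the sources whose paths match any pattern, and passes them to ChromaDB as a "$in" where clause. A condensed sketch of the same flow, reusing the matches_filter helper sketched above; the collection object and arguments are assumed, not the project's actual objects:

    from typing import Any


    def prefiltered_query(
        collection: Any, query: str, n_results: int, path_filters: tuple[str, ...]
    ):
        """Sketch: restrict a ChromaDB query to sources matching the glob filters."""
        all_docs = collection.get()  # metadatas for every stored chunk
        matching = {
            meta["source"]
            for meta in all_docs["metadatas"]
            if meta and "source" in meta
            and any(matches_filter(meta["source"], p) for p in path_filters)
        }
        if not matching:
            # No stored source can match, so skip the embedding query entirely.
            return None
        return collection.query(
            query_texts=[query],
            n_results=n_results,
            where={"source": {"$in": list(matching)}},
        )

Note the empty-match case: the indexer above returns an empty result set immediately instead of running an unfiltered query.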
@@ -530,7 +663,7 @@ def search(
                     metadata=results["metadatas"][0][i],
                     doc_id=doc_id,
                 )
-                if not paths or self._matches_paths(doc, paths):
+                if self._matches_paths(doc, paths, path_filters):
                     docs_by_source[source_id] = (doc, results["distances"][0][i])
 
             # Take top n results
@@ -541,7 +674,7 @@ def search(
         else:
             # Process individual chunks
             documents, distances, _ = self._process_individual_chunks(
-                results, paths, n_results, explain
+                results, paths, n_results, explain, path_filters
             )
 
         # Add explanations if requested
@@ -564,6 +697,7 @@ def _process_individual_chunks(
         paths: list[Path] | None,
         n_results: int,
         explain: bool,
+        path_filters: tuple[str, ...] | None = None,
     ) -> tuple[list[Document], list[float], list[dict]]:
         """Process search results as individual chunks."""
         documents: list[Document] = []
@@ -583,7 +717,7 @@ def _process_individual_chunks(
                 doc_id=doc_id,
             )
 
-            if paths and not self._matches_paths(doc, paths):
+            if not self._matches_paths(doc, paths, path_filters):
                 continue
 
             documents.append(doc)
tests/test_indexing.py

Lines changed: 95 additions & 17 deletions
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pytest
 from gptme_rag.indexing.document import Document
 
@@ -72,24 +74,100 @@ def test_indexer_add_documents(indexer, test_docs):
 
 
 def test_indexer_directory(indexer, tmp_path):
-    # Create test files
-    (tmp_path / "test1.txt").write_text("Content about Python")
-    (tmp_path / "test2.txt").write_text("Content about JavaScript")
-    (tmp_path / "subdir").mkdir()
-    (tmp_path / "subdir" / "test3.txt").write_text("Content about TypeScript")
+    # Create test files in different directories with different extensions
+    docs_dir = tmp_path / "docs"
+    src_dir = tmp_path / "src"
+    docs_dir.mkdir()
+    src_dir.mkdir()
 
-    indexer.index_directory(tmp_path)
+    # Create markdown files in docs
+    (docs_dir / "guide.md").write_text("Python programming guide")
+    (docs_dir / "tutorial.md").write_text("JavaScript tutorial")
 
-    # Search for programming languages
-    python_results, python_distances, _ = indexer.search("Python")
-    js_results, js_distances, _ = indexer.search("JavaScript")
-    ts_results, ts_distances, _ = indexer.search("TypeScript")
+    # Create Python files in src
+    (src_dir / "main.py").write_text("def main(): print('Hello')")
+    (src_dir / "utils.py").write_text("def util(): return True")
 
-    assert len(python_results) > 0
-    assert len(js_results) > 0
-    assert len(ts_results) > 0
+    # Create a text file in root
+    (tmp_path / "notes.txt").write_text("Random notes")
+
+    # Index everything
+    indexer.index_directory(tmp_path)
 
-    # Verify distances are returned
-    assert len(python_distances) > 0
-    assert len(js_distances) > 0
-    assert len(ts_distances) > 0
+    # Test extension filter (*.md)
+    md_results, _, _ = indexer.search(
+        "programming",
+        path_filters=("*.md",),
+    )
+    assert len(md_results) > 0
+    assert all(doc.metadata["source"].endswith(".md") for doc in md_results)
+
+    # Test directory pattern (src/*.py)
+    py_results, _, _ = indexer.search(
+        "def",
+        path_filters=(str(src_dir / "*.py"),),
+    )
+    assert len(py_results) > 0
+    assert all(
+        Path(doc.metadata["source"]).parent.name == "src"
+        and doc.metadata["source"].endswith(".py")
+        for doc in py_results
+    )
+
+    # Test multiple patterns
+    multi_results, _, _ = indexer.search(
+        "programming",
+        path_filters=("*.md", "*.py"),
+    )
+    assert len(multi_results) > 0
+    assert all(doc.metadata["source"].endswith((".md", ".py")) for doc in multi_results)
+
+    # Test with path and filter combined
+    docs_md_results, _, _ = indexer.search(
+        "tutorial",
+        paths=[docs_dir],
+        path_filters=("*.md",),
+    )
+    assert len(docs_md_results) > 0
+    assert all(
+        Path(doc.metadata["source"]).parent.name == "docs"
+        and doc.metadata["source"].endswith(".md")
+        for doc in docs_md_results
+    )
+
+
+def test_path_matching(indexer):
+    # Test the _matches_paths method directly
+    doc = Document(
+        content="test",
+        metadata={"source": "/home/user/project/docs/guide.md"},
+        doc_id="test",
+    )
+
+    # Test simple extension filter
+    assert indexer._matches_paths(doc, path_filters=("*.md",))
+    assert not indexer._matches_paths(doc, path_filters=("*.py",))
+
+    # Test directory pattern
+    assert indexer._matches_paths(doc, path_filters=("docs/*.md",))
+    assert not indexer._matches_paths(doc, path_filters=("src/*.md",))
+
+    # Test multiple patterns
+    assert indexer._matches_paths(doc, path_filters=("*.py", "*.md"))
+    assert indexer._matches_paths(doc, path_filters=("src/*.py", "docs/*.md"))
+
+    # Test with exact paths
+    assert indexer._matches_paths(doc, paths=[Path("/home/user/project/docs")])
+    assert not indexer._matches_paths(doc, paths=[Path("/home/user/project/src")])
+
+    # Test combining paths and filters
+    assert indexer._matches_paths(
+        doc,
+        paths=[Path("/home/user/project/docs")],
+        path_filters=("*.md",),
+    )
+    assert not indexer._matches_paths(
+        doc,
        paths=[Path("/home/user/project/docs")],
        path_filters=("*.py",),
    )
