@@ -483,16 +483,85 @@ def compute_relevance_score(
 
         return total_score, scores
 
-    def _matches_paths(self, doc: Document, paths: list[Path]) -> bool:
-        """Check if document matches any of the given paths."""
+    def _matches_paths(
+        self,
+        doc: Document,
+        paths: list[Path] | None = None,
+        path_filters: tuple[str, ...] | None = None,
+    ) -> bool:
+        """Check if document matches any of the given paths or filters.
+
+        Args:
+            doc: Document to check
+            paths: List of paths to match against (exact path matching)
+            path_filters: List of glob patterns to match against
+
+        Returns:
+            bool: True if the document passes both the path and filter checks (unspecified checks always pass)
+        """
         source = doc.metadata.get("source", "")
         if not source:
             return False
+
         source_path = Path(source)
-        return any(
-            path.resolve() in source_path.parents or path.resolve() == source_path
-            for path in paths
-        )
+
+        path_match = True
+        filter_match = True
+
+        # Check exact path matches if paths are specified
+        if paths:
+            path_match = any(
+                path.resolve() in source_path.parents or path.resolve() == source_path
+                for path in paths
+            )
+            if not path_match:
+                logger.debug(f"Path match failed: {source_path} not in {paths}")
+                return False
+
+        # Check pattern matches if filters are specified
+        if path_filters:
+            # Get both the full path and relative components for matching
+            source_str = str(source_path)
+            source_name = source_path.name
+            source_parts = source_path.parts
+
+            filter_match = False  # Set to True if any pattern matches
+            for pattern in path_filters:
+                logger.debug(f"Checking pattern: {pattern} against {source_str}")
+
+                # Handle different pattern types
+                if pattern.startswith("*."):
+                    # Simple extension filter
+                    if source_name.endswith(pattern[1:]):
+                        logger.debug(f"Matched extension pattern: {pattern}")
+                        filter_match = True
+                        break
+                else:
+                    # Convert pattern to parts for matching
+                    pattern_path = Path(pattern)
+                    pattern_parts = pattern_path.parts
+
+                    # Try different matching strategies
+                    if (
+                        fnmatch_path(source_str, pattern)
+                        or fnmatch_path(source_str, f"**/{pattern}")
+                        or (
+                            len(pattern_parts) <= len(source_parts)
+                            and fnmatch_path(
+                                str(Path(*source_parts[-len(pattern_parts) :])), pattern
+                            )
+                        )
+                    ):
+                        logger.debug(f"Matched path pattern: {pattern}")
+                        filter_match = True
+                        break
+
+            if not filter_match:
+                logger.debug(f"No patterns matched: {source_str}")
+                return False
+
+        # Both conditions must be met (if specified)
+        return path_match and filter_match
 
     def search(
         self,
@@ -503,16 +572,80 @@ def search(
         group_chunks: bool = True,
         max_attempts: int = 3,
         explain: bool = False,
+        path_filters: tuple[str, ...] | None = None,
     ) -> tuple[list[Document], list[float], list[dict[str, Any]] | None]:
-        """Search for documents similar to the query."""
+        """Search for documents similar to the query.
+
+        Args:
+            query: The search query text
+            paths: List of paths to search within (exact path matching)
+            n_results: Maximum number of results to return
+            where: Additional where clauses for ChromaDB query
+            group_chunks: Whether to group chunks from the same document
+            max_attempts: Maximum number of search attempts
+            explain: Whether to return scoring explanations
+            path_filters: Glob patterns to filter documents by path. Supports:
+                - Simple extension filters (*.md, *.py)
+                - Path patterns (src/*.py, docs/**/*.md)
+                - Multiple patterns can be combined
+
+        Returns:
+            Tuple of (documents, distances, explanations)
+            - documents: List of matching Document objects
+            - distances: List of embedding distances
+            - explanations: List of scoring explanations (if explain=True)
+
+        Examples:
+            # Search in markdown files
+            search("query", path_filters=("*.md",))
+
+            # Search in Python files in src directory
+            search("query", path_filters=("src/**/*.py",))
+
+            # Search in multiple file types
+            search("query", path_filters=("*.md", "*.py"))
+
+            # Combine paths and filters
+            search("query", paths=[Path("docs")], path_filters=("*.md",))
+        """
         # Get more results than needed to allow for filtering
         query_n_results = n_results * 3 if group_chunks else n_results
 
+        # Prepare where clause
+        search_where = where.copy() if where else {}
+
+        # Pre-filter documents based on all patterns
+        if path_filters:
+            logger.debug(f"Filtering with patterns: {path_filters}")
+            all_docs = self.collection.get()
+            matching_sources = set()
+
+            for meta in all_docs["metadatas"]:
+                if not meta or "source" not in meta:
+                    continue
+
+                source_path = Path(meta["source"])
+                # Create a dummy document for path matching
+                doc = Document(
+                    content="", metadata=meta, doc_id="temp", source_path=source_path
+                )
+
+                # Use _matches_paths to check all patterns
+                if self._matches_paths(doc, paths=None, path_filters=path_filters):
+                    matching_sources.add(str(source_path))
+
+            if matching_sources:
+                logger.debug(f"Found {len(matching_sources)} matching files")
+                search_where["source"] = {"$in": list(matching_sources)}
+            else:
+                logger.debug("No files matched the filter patterns")
+                return [], [], [] if explain else None
+
         # Query the collection
         results = self.collection.query(
             query_texts=[query],
             n_results=query_n_results,
-            where=where,
+            where=search_where,
         )
 
         if not results["ids"][0]:
@@ -530,7 +663,7 @@ def search(
                     metadata=results["metadatas"][0][i],
                     doc_id=doc_id,
                 )
-                if not paths or self._matches_paths(doc, paths):
+                if self._matches_paths(doc, paths, path_filters):
                     docs_by_source[source_id] = (doc, results["distances"][0][i])
 
             # Take top n results
@@ -541,7 +674,7 @@ def search(
         else:
             # Process individual chunks
             documents, distances, _ = self._process_individual_chunks(
-                results, paths, n_results, explain
+                results, paths, n_results, explain, path_filters
            )
 
         # Add explanations if requested
@@ -564,6 +697,7 @@ def _process_individual_chunks(
         paths: list[Path] | None,
         n_results: int,
         explain: bool,
+        path_filters: tuple[str, ...] | None = None,
     ) -> tuple[list[Document], list[float], list[dict]]:
         """Process search results as individual chunks."""
         documents: list[Document] = []
@@ -583,7 +717,7 @@ def _process_individual_chunks(
                 doc_id=doc_id,
             )
 
-            if paths and not self._matches_paths(doc, paths):
+            if not self._matches_paths(doc, paths, path_filters):
                 continue
 
             documents.append(doc)
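
The pattern branch of `_matches_paths` relies on a `fnmatch_path` helper that is defined outside this hunk, so its exact behavior is not visible here. A minimal stand-in, assuming it simply applies `fnmatch`-style glob matching to a separator-normalized path string (the repository's real helper may differ), behaves like this:

```python
from fnmatch import fnmatch
from pathlib import PurePath


def fnmatch_path(path_str: str, pattern: str) -> bool:
    """Illustrative stand-in for the helper called in the diff, not the real code.

    Normalizes the path to forward slashes, then applies fnmatch-style glob
    matching, where "*" also crosses directory separators.
    """
    return fnmatch(PurePath(path_str).as_posix(), pattern)


# Examples mirroring the strategies tried in _matches_paths:
assert fnmatch_path("src/pkg/module.py", "src/*/*.py")      # direct pattern match
assert fnmatch_path("docs/guide/intro.md", "**/intro.md")   # the "**/" fallback
assert not fnmatch_path("src/pkg/module.py", "tests/*.py")  # different top directory
```

The tail-of-path strategy in the diff (matching the last `len(pattern_parts)` components of the source against the pattern) is what lets a relative pattern such as `src/*.py` still match an absolutely indexed source like `/home/user/repo/src/main.py`.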
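
For context on the new pre-filter step in `search()`: it scans every stored metadata entry, keeps the sources whose paths satisfy the glob patterns, and hands ChromaDB a `$in` membership clause. A rough, self-contained sketch of that narrowing step, using made-up metadata rows and only the simple extension-filter case (the full matching logic is `_matches_paths` above):

```python
from fnmatch import fnmatch

# Made-up rows standing in for collection.get()["metadatas"];
# only the "source" field matters for the pre-filter.
metadatas = [
    {"source": "docs/guide/intro.md"},
    {"source": "src/app/main.py"},
    {"source": "README.md"},
]

path_filters = ("*.md",)

matching_sources = {
    meta["source"]
    for meta in metadatas
    if any(
        meta["source"].endswith(pattern[1:])
        if pattern.startswith("*.")
        else fnmatch(meta["source"], pattern)
        for pattern in path_filters
    )
}

# Shape of the clause passed as collection.query(where=...);
# "$in" is ChromaDB's membership operator for metadata filters.
search_where = {"source": {"$in": sorted(matching_sources)}}
print(search_where)
# {'source': {'$in': ['README.md', 'docs/guide/intro.md']}}
```

Note that when no source matches, the diff returns an empty result set immediately rather than issuing a query with an empty `$in` list.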