Skip to content

Commit 4c9a7dd

Browse files
authored
fix: stop reindexing unchanged files by comparing against the stored last_modified timestamp (#5)
1 parent 90ba8c5 commit 4c9a7dd

File tree

3 files changed

+33
-7
lines changed

3 files changed

+33
-7
lines changed

gptme_rag/cli.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import signal
55
import sys
66
import time
7+
from datetime import datetime
78
from pathlib import Path
89

910
import click
@@ -65,9 +66,21 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
6566
for doc in existing_docs:
6667
if "source" in doc.metadata:
6768
abs_path = os.path.abspath(doc.metadata["source"])
68-
mtime = doc.metadata.get("mtime", 0)
69-
existing_files[abs_path] = mtime
70-
logger.debug("Existing file: %s (mtime: %s)", abs_path, mtime)
69+
last_modified = doc.metadata.get("last_modified")
70+
if last_modified:
71+
try:
72+
# Parse ISO format timestamp to float
73+
existing_files[abs_path] = datetime.fromisoformat(
74+
last_modified
75+
).timestamp()
76+
except ValueError:
77+
logger.warning(
78+
"Invalid last_modified format: %s", last_modified
79+
)
80+
existing_files[abs_path] = 0
81+
else:
82+
existing_files[abs_path] = 0
83+
# logger.debug("Existing file: %s", abs_path) # Too spammy
7184

7285
logger.debug("Loaded %d existing files from index", len(existing_files))
7386

@@ -91,13 +104,15 @@ def index(paths: list[Path], pattern: str, persist_dir: Path):
91104
abs_source = os.path.abspath(source)
92105
doc.metadata["source"] = abs_source
93106
current_mtime = os.path.getmtime(abs_source)
94-
doc.metadata["mtime"] = current_mtime
95107

96108
# Include if file is new or modified
97109
if abs_source not in existing_files:
98110
logger.debug("New file: %s", abs_source)
99111
filtered_documents.append(doc)
100-
elif current_mtime > existing_files[abs_source]:
112+
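# Round both values to microseconds (6 decimal places) before comparing: the stored value is parsed from an ISO timestamp, which carries at most microsecond precision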
113+
elif round(current_mtime, 6) > round(
114+
existing_files[abs_source], 6
115+
):
101116
logger.debug(
102117
"Modified file: %s (current: %s, stored: %s)",
103118
abs_source,

gptme_rag/indexing/indexer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def __init__(
9292
self.persist_directory = Path(persist_directory).expanduser().resolve()
9393
self.persist_directory.mkdir(parents=True, exist_ok=True)
9494
logger.info(f"Using persist directory: {self.persist_directory}")
95+
9596
settings.persist_directory = str(self.persist_directory)
9697
self.client = chromadb.PersistentClient(
9798
path=str(self.persist_directory), settings=settings
@@ -516,6 +517,9 @@ def list_documents(self, group_by_source: bool = True) -> list[Document]:
516517
"""
517518
# Get all documents from collection
518519
results = self.collection.get()
520+
logger.debug("ChromaDB returned %d documents", len(results["ids"]))
521+
if results["ids"]:
522+
logger.debug("First document metadata: %s", results["metadatas"][0])
519523

520524
if not results["ids"]:
521525
return []
@@ -912,4 +916,8 @@ def get_all_documents(self) -> list[Document]:
912916
Returns:
913917
List of all documents in the index, including all chunks.
914918
"""
915-
return self.list_documents(group_by_source=False)
919+
logger.debug("Getting all documents from index")
920+
docs = self.list_documents(group_by_source=False)
921+
for doc in docs:
922+
logger.debug("Retrieved document with metadata: %s", doc.metadata)
923+
return docs

gptme_rag/indexing/watcher.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import time
55
from pathlib import Path
6+
from datetime import datetime
67

78
from watchdog.events import FileSystemEvent, FileSystemEventHandler
89
from watchdog.observers import Observer
@@ -321,7 +322,9 @@ def _process_updates(self) -> None:
321322

322323
# Sort updates by modification time to get latest versions
323324
updates = sorted(
324-
existing_updates, key=lambda p: p.stat().st_mtime, reverse=True
325+
existing_updates,
326+
key=lambda p: datetime.fromtimestamp(p.stat().st_mtime),
327+
reverse=True,
325328
)
326329
logger.debug(f"Sorted updates: {[str(p) for p in updates]}")
327330

0 commit comments

Comments
 (0)