
Commit efbe502

Authored by xingyaoww, openhands-agent, and ryanhoangt
Replace diskcache with custom file-based storage system to resolve memory issues (#82)
* Fix memory issue in file_editor and add large history insert test
* Replace diskcache with custom file-based storage system
* Remove diskcache dependency from pyproject.toml
* Simplify file_editor function by removing chunk serialization
* Implement FileCache class and refactor FileHistoryManager to use it
* Minimize changes while maintaining new functionality
* Implement key hashing in FileCache to handle filepath issues
* Update clear method in FileCache to handle nested directories
* Fix FileCache issues: size limit functionality and empty string keys
* Add test case for size limit with empty key
* Update poetry.lock and pyproject.toml
* Improve FileHistoryManager with consistent key generation and add get_all_history method
* Fix FileCache and FileHistoryManager issues, update tests
* Apply linter fixes
* Fix FileCache size limit functionality and edge cases
* Apply linting fixes
* Fix JSON formatting in file_editor function
* Apply linting fixes
* Add detailed logging to FileCache for debugging
* Apply linting fixes
* Optimize memory usage in file_editor function
* Apply ruff-format changes
* Add FileCache import to __init__.py
* Add FileCache import to openhands_aci/__init__.py
* Update FileCache import in test_file_cache.py
* Export FileCache in editor/__init__.py
* Apply ruff linter fixes
* use mtime
* bump to 0.2.4
* use utf8 measurement

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Hoang Tran <descience.thh10@gmail.com>
Parent: 707d6da

File tree: 12 files changed, +524 −99 lines


.openhands/microagents/repo.md

Lines changed: 1 addition & 1 deletion
@@ -43,4 +43,4 @@ CI/CD:
 - `lint.yml`: Code linting checks
 - `py-unit-tests.yml`: Unit tests
 - `py-intg-tests.yml`: Integration tests
-- `pypi-release.yml`: Package publishing to PyPI
\ No newline at end of file
+- `pypi-release.yml`: Package publishing to PyPI

openhands_aci/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 from .editor import file_editor
+from .editor.file_cache import FileCache

-__all__ = ['file_editor']
+__all__ = ['file_editor', 'FileCache']
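
With the new export, downstream code can pull FileCache straight from the package root. A minimal sketch, assuming the package is installed (the temporary cache directory is illustrative):

import tempfile

from openhands_aci import FileCache

cache = FileCache(tempfile.mkdtemp(prefix='aci_cache_'))  # illustrative temp dir
cache.set('greeting', 'hello')
print(cache.get('greeting'))  # -> 'hello'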

openhands_aci/editor/__init__.py

Lines changed: 20 additions & 3 deletions
@@ -3,10 +3,13 @@

 from .editor import Command, OHEditor
 from .exceptions import ToolError
+from .file_cache import FileCache
 from .results import ToolResult

 _GLOBAL_EDITOR = OHEditor()

+__all__ = ['Command', 'OHEditor', 'ToolError', 'ToolResult', 'FileCache', 'file_editor']
+

 def _make_api_tool_result(tool_result: ToolResult) -> str:
     """Convert an agent ToolResult to an API ToolResultBlockParam."""
@@ -44,6 +47,20 @@ def file_editor(

     formatted_output_and_error = _make_api_tool_result(result)
     marker_id = uuid.uuid4().hex
-    return f"""<oh_aci_output_{marker_id}>
-{json.dumps(result.to_dict(extra_field={'formatted_output_and_error': formatted_output_and_error}), indent=2)}
-</oh_aci_output_{marker_id}>"""
+
+    def json_generator():
+        yield '{'
+        first = True
+        for key, value in result.to_dict().items():
+            if not first:
+                yield ','
+            first = False
+            yield f'"{key}": {json.dumps(value)}'
+        yield f', "formatted_output_and_error": {json.dumps(formatted_output_and_error)}'
+        yield '}'
+
+    return (
+        f'<oh_aci_output_{marker_id}>\n'
+        + ''.join(json_generator())
+        + f'\n</oh_aci_output_{marker_id}>'
+    )
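
The rewritten return path emits the JSON envelope field by field instead of building one large indented json.dumps of the whole result, so a second full copy of a big file's contents is never materialized during formatting. A standalone sketch of the same pattern, with a made-up payload dict standing in for ToolResult.to_dict():

import json

def serialize_incrementally(payload: dict, extra_key: str, extra_value: str) -> str:
    # Emit the object piece by piece; each value is dumped individually,
    # so no single indented dump of the whole dict is ever created.
    # Keys are interpolated raw, so they must not need JSON escaping.
    def json_generator():
        yield '{'
        first = True
        for key, value in payload.items():
            if not first:
                yield ','
            first = False
            yield f'"{key}": {json.dumps(value)}'
        yield f', "{extra_key}": {json.dumps(extra_value)}'
        yield '}'
    return ''.join(json_generator())

# Illustrative payload only.
doc = serialize_incrementally(
    {'output': 'big file contents', 'error': None},
    'formatted_output_and_error',
    'OK',
)
print(json.loads(doc)['formatted_output_and_error'])  # -> OK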

openhands_aci/editor/file_cache.py

Lines changed: 146 additions & 0 deletions
import hashlib
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Optional

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


class FileCache:
    def __init__(self, directory: str, size_limit: Optional[int] = None):
        self.directory = Path(directory)
        self.directory.mkdir(parents=True, exist_ok=True)
        self.size_limit = size_limit
        self.current_size = 0
        self._update_current_size()
        logger.debug(
            f'FileCache initialized with directory: {self.directory}, size_limit: {self.size_limit}, current_size: {self.current_size}'
        )

    def _get_file_path(self, key: str) -> Path:
        hashed_key = hashlib.sha256(key.encode()).hexdigest()
        return self.directory / f'{hashed_key}.json'

    def _update_current_size(self):
        self.current_size = sum(
            f.stat().st_size for f in self.directory.glob('*.json') if f.is_file()
        )
        logger.debug(f'Current size updated: {self.current_size}')

    def set(self, key: str, value: Any) -> None:
        file_path = self._get_file_path(key)
        content = json.dumps({'key': key, 'value': value})
        content_size = len(content.encode('utf-8'))
        logger.debug(f'Setting key: {key}, content_size: {content_size}')

        if self.size_limit is not None:
            if file_path.exists():
                old_size = file_path.stat().st_size
                size_diff = content_size - old_size
                logger.debug(
                    f'Existing file: old_size: {old_size}, size_diff: {size_diff}'
                )
                if size_diff > 0:
                    while (
                        self.current_size + size_diff > self.size_limit
                        and len(self) > 1
                    ):
                        logger.debug(
                            f'Evicting oldest (existing file case): current_size: {self.current_size}, size_limit: {self.size_limit}'
                        )
                        self._evict_oldest(file_path)
            else:
                while (
                    self.current_size + content_size > self.size_limit and len(self) > 1
                ):
                    logger.debug(
                        f'Evicting oldest (new file case): current_size: {self.current_size}, size_limit: {self.size_limit}'
                    )
                    self._evict_oldest(file_path)

        if file_path.exists():
            self.current_size -= file_path.stat().st_size
            logger.debug(
                f'Existing file removed from current_size: {self.current_size}'
            )

        with open(file_path, 'w') as f:
            f.write(content)

        self.current_size += content_size
        logger.debug(f'File written, new current_size: {self.current_size}')
        os.utime(
            file_path, (time.time(), time.time())
        )  # Update access and modification time

    def _evict_oldest(self, exclude_path: Optional[Path] = None):
        oldest_file = min(
            (
                f
                for f in self.directory.glob('*.json')
                if f.is_file() and f != exclude_path
            ),
            key=os.path.getmtime,
        )
        evicted_size = oldest_file.stat().st_size
        self.current_size -= evicted_size
        os.remove(oldest_file)
        logger.debug(
            f'Evicted file: {oldest_file}, size: {evicted_size}, new current_size: {self.current_size}'
        )

    def get(self, key: str, default: Any = None) -> Any:
        file_path = self._get_file_path(key)
        if not file_path.exists():
            logger.debug(f'Get: Key not found: {key}')
            return default
        with open(file_path, 'r') as f:
            data = json.load(f)
        os.utime(file_path, (time.time(), time.time()))  # Update access time
        logger.debug(f'Get: Key found: {key}')
        return data['value']

    def delete(self, key: str) -> None:
        file_path = self._get_file_path(key)
        if file_path.exists():
            deleted_size = file_path.stat().st_size
            self.current_size -= deleted_size
            os.remove(file_path)
            logger.debug(
                f'Deleted key: {key}, size: {deleted_size}, new current_size: {self.current_size}'
            )

    def clear(self) -> None:
        for item in self.directory.glob('*.json'):
            if item.is_file():
                os.remove(item)
        self.current_size = 0
        logger.debug('Cache cleared')

    def __contains__(self, key: str) -> bool:
        exists = self._get_file_path(key).exists()
        logger.debug(f'Contains check: {key}, result: {exists}')
        return exists

    def __len__(self) -> int:
        length = sum(1 for _ in self.directory.glob('*.json') if _.is_file())
        logger.debug(f'Cache length: {length}')
        return length

    def __iter__(self):
        for file in self.directory.glob('*.json'):
            if file.is_file():
                with open(file, 'r') as f:
                    data = json.load(f)
                logger.debug(f"Yielding key: {data['key']}")
                yield data['key']

    def __getitem__(self, key: str) -> Any:
        return self.get(key)

    def __setitem__(self, key: str, value: Any) -> None:
        self.set(key, value)
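
A quick sketch of the size-limit behavior, assuming the module is importable at the path shown (keys and sizes are illustrative; eviction drops the least recently touched entry, ordered by mtime):

import tempfile

from openhands_aci.editor.file_cache import FileCache

cache = FileCache(tempfile.mkdtemp(prefix='fc_demo_'), size_limit=200)  # byte budget

cache.set('a', 'x' * 50)  # each entry is stored as JSON: {"key": ..., "value": ...}
cache.set('b', 'y' * 50)
cache.set('c', 'z' * 50)  # pushes the total past 200 bytes, so the oldest entry goes

print(len(cache))    # 2 if one eviction occurred
print('a' in cache)  # False once 'a' has been evicted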

openhands_aci/editor/history.py

Lines changed: 68 additions & 43 deletions
@@ -1,10 +1,11 @@
 """History management for file edits with disk-based storage and memory constraints."""

+import logging
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional

-from diskcache import Cache
+from .file_cache import FileCache


 class FileHistoryManager:
@@ -21,67 +22,91 @@ def __init__(

         Notes:
         - Each file's history is limited to the last N entries to conserve memory
-        - The disk cache is limited to 500MB total to prevent excessive disk usage
+        - The file cache is limited to prevent excessive disk usage
         - Older entries are automatically removed when limits are exceeded
         """
         self.max_history_per_file = max_history_per_file
         if history_dir is None:
             history_dir = Path(tempfile.mkdtemp(prefix='oh_editor_history_'))
-        self.cache = Cache(str(history_dir), size_limit=5e8)  # 500MB size limit
+        self.cache = FileCache(str(history_dir))
+        self.logger = logging.getLogger(__name__)
+
+    def _get_metadata_key(self, file_path: Path) -> str:
+        return f'{file_path}.metadata'
+
+    def _get_history_key(self, file_path: Path, counter: int) -> str:
+        return f'{file_path}.{counter}'

     def add_history(self, file_path: Path, content: str):
         """Add a new history entry for a file."""
-        key = str(file_path)
-        # Get list of entry indices and counter for this file
-        entries_key = f'{key}:entries'
-        counter_key = f'{key}:counter'
-        entries = self.cache.get(entries_key, [])
-        counter = self.cache.get(counter_key, 0)
-
-        # Add new entry with monotonically increasing counter
-        entry_key = f'{key}:{counter}'
-        self.cache.set(entry_key, content)
-        entries.append(entry_key)
-        counter += 1
+        metadata_key = self._get_metadata_key(file_path)
+        metadata = self.cache.get(metadata_key, {'entries': [], 'counter': 0})
+        counter = metadata['counter']
+
+        # Add new entry
+        history_key = self._get_history_key(file_path, counter)
+        self.cache.set(history_key, content)
+
+        metadata['entries'].append(counter)
+        metadata['counter'] += 1

         # Keep only last N entries
-        if len(entries) > self.max_history_per_file:
-            old_key = entries.pop(0)
-            self.cache.delete(old_key)
+        while len(metadata['entries']) > self.max_history_per_file:
+            old_counter = metadata['entries'].pop(0)
+            old_history_key = self._get_history_key(file_path, old_counter)
+            self.cache.delete(old_history_key)

-        # Update entries list and counter
-        self.cache.set(entries_key, entries)
-        self.cache.set(counter_key, counter)
+        self.cache.set(metadata_key, metadata)

     def get_last_history(self, file_path: Path) -> Optional[str]:
         """Get the most recent history entry for a file."""
-        key = str(file_path)
-        entries_key = f'{key}:entries'
-        entries = self.cache.get(entries_key, [])
+        metadata_key = self._get_metadata_key(file_path)
+        metadata = self.cache.get(metadata_key, {'entries': [], 'counter': 0})
+        entries = metadata['entries']

         if not entries:
             return None

-        # Get and remove last entry
-        last_key = entries.pop()
-        content = self.cache.get(last_key)
-        self.cache.delete(last_key)
+        # Get the last entry without removing it
+        last_counter = entries[-1]
+        history_key = self._get_history_key(file_path, last_counter)
+        content = self.cache.get(history_key)
+
+        if content is None:
+            self.logger.warning(f'History entry not found for {file_path}')
+            return None

-        # Update entries list
-        self.cache.set(entries_key, entries)
         return content

     def clear_history(self, file_path: Path):
         """Clear history for a given file."""
-        key = str(file_path)
-        entries_key = f'{key}:entries'
-        counter_key = f'{key}:counter'
-        entries = self.cache.get(entries_key, [])
-
-        # Delete all entries
-        for entry_key in entries:
-            self.cache.delete(entry_key)
-
-        # Delete entries list and counter
-        self.cache.delete(entries_key)
-        self.cache.delete(counter_key)
+        metadata_key = self._get_metadata_key(file_path)
+        metadata = self.cache.get(metadata_key, {'entries': [], 'counter': 0})
+
+        # Delete all history entries
+        for counter in metadata['entries']:
+            history_key = self._get_history_key(file_path, counter)
+            self.cache.delete(history_key)
+
+        # Clear metadata
+        self.cache.set(metadata_key, {'entries': [], 'counter': 0})
+
+    def get_all_history(self, file_path: Path) -> List[str]:
+        """Get all history entries for a file."""
+        metadata_key = self._get_metadata_key(file_path)
+        metadata = self.cache.get(metadata_key, {'entries': [], 'counter': 0})
+        entries = metadata['entries']
+
+        history = []
+        for counter in entries:
+            history_key = self._get_history_key(file_path, counter)
+            content = self.cache.get(history_key)
+            if content is not None:
+                history.append(content)
+
+        return history
+
+    def get_metadata(self, file_path: Path):
+        """Get metadata for a file (for testing purposes)."""
+        metadata_key = self._get_metadata_key(file_path)
+        return self.cache.get(metadata_key, {'entries': [], 'counter': 0})
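
The reworked manager keeps one metadata record (entry counters) per file alongside one cache entry per version, and get_last_history is now non-destructive. A usage sketch (the target path is illustrative and serves only as a cache key; the file itself is never read):

from pathlib import Path

from openhands_aci.editor.history import FileHistoryManager

manager = FileHistoryManager(max_history_per_file=2)  # history dir defaults to a fresh temp dir
target = Path('/workspace/example.txt')  # illustrative

manager.add_history(target, 'version 1')
manager.add_history(target, 'version 2')
manager.add_history(target, 'version 3')  # 'version 1' is pruned (limit is 2)

print(manager.get_all_history(target))   # ['version 2', 'version 3']
print(manager.get_last_history(target))  # 'version 3', and the entry is kept
manager.clear_history(target)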
(file path not shown in this capture; from its docstring, this is the tree-sitter 0.24.0 compatibility module)

Lines changed: 4 additions & 3 deletions
@@ -1,6 +1,7 @@
 """Compatibility layer for tree-sitter 0.24.0."""

 import importlib
+
 from tree_sitter import Language, Parser

 # Cache of loaded languages
@@ -11,13 +12,13 @@ def get_parser(language):
     """Get a Parser object for the given language name."""
     if language not in _language_cache:
         # Try to import the language module
-        module_name = f"tree_sitter_{language}"
+        module_name = f'tree_sitter_{language}'
         try:
             module = importlib.import_module(module_name)
             _language_cache[language] = Language(module.language())
         except ImportError:
             raise ValueError(
-                f"Language {language} is not supported. Please install {module_name} package."
+                f'Language {language} is not supported. Please install {module_name} package.'
             )

-    return Parser(_language_cache[language])
\ No newline at end of file
+    return Parser(_language_cache[language])
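
A short usage sketch for the compatibility helper, assuming get_parser is imported from the module above and the tree_sitter_python grammar package is installed:

# Assumes: from <compat module above> import get_parser
parser = get_parser('python')
tree = parser.parse(b'def hello():\n    return 42\n')
print(tree.root_node.type)  # -> 'module'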
