Skip to content

Commit fb5fe56

Browse files
ryanhoangtczlll and co-authors authored
Add tools from LocAgent (#108)
* move code from locagent pr * fix test * Fix tree-sitter compatibility issue between LocAgent and OpenHands (#121) * Fix tree-sitter compatibility with version 0.24.0 * update query_file * poetry lock * fix test * fix test & remote ts compat layer * skip on CI * remove tree-siter-languages & use pytest-forked --------- Co-authored-by: Hoang Tran <descience.thh10@gmail.com> * add missing packages as required deps * bump to 0.2.14 --------- Co-authored-by: Zhaoling Chen <1781750070@qq.com>
1 parent 5efbe15 commit fb5fe56

37 files changed

+10223
-364
lines changed

.github/workflows/py-intg-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ jobs:
3737
POETRY_VIRTUALENVS_CREATE: false
3838
- name: Run tests
3939
run: |
40-
poetry run pytest ./tests/integration
40+
poetry run pytest ./tests/integration --forked

.github/workflows/py-unit-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ jobs:
3232
POETRY_VIRTUALENVS_CREATE: false
3333
- name: Run tests
3434
run: |
35-
poetry run pytest ./tests/unit
35+
poetry run pytest ./tests/unit --forked

dev_config/python/.pre-commit-config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
exclude: ^openhands_aci/indexing/locagent/
2+
13
repos:
24
- repo: https://github.com/pre-commit/pre-commit-hooks
35
rev: v4.5.0
@@ -38,6 +40,6 @@ repos:
3840
- id: mypy
3941
additional_dependencies:
4042
[types-requests, types-setuptools, types-pyyaml, types-toml, types-cachetools]
41-
entry: mypy --config-file dev_config/python/mypy.ini openhands_aci/
43+
entry: mypy --config-file dev_config/python/mypy.ini openhands_aci/ --exclude 'openhands_aci/indexing/locagent/'
4244
always_run: true
4345
pass_filenames: false
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import libcst as cst
2+
import libcst.matchers as m
3+
4+
5+
class CompressTransformer(cst.CSTTransformer):
6+
DESCRIPTION = str = 'Replaces function body with ...'
7+
replacement_string = '"$$FUNC_BODY_REPLACEMENT_STRING$$"'
8+
9+
def __init__(self, keep_constant=True):
10+
self.keep_constant = keep_constant
11+
12+
def leave_Module(
13+
self, original_node: cst.Module, updated_node: cst.Module
14+
) -> cst.Module:
15+
new_body = [
16+
stmt
17+
for stmt in updated_node.body
18+
if m.matches(stmt, m.ClassDef())
19+
or m.matches(stmt, m.FunctionDef())
20+
or (
21+
self.keep_constant
22+
and m.matches(stmt, m.SimpleStatementLine())
23+
and m.matches(stmt.body[0], m.Assign())
24+
)
25+
]
26+
return updated_node.with_changes(body=new_body)
27+
28+
def leave_ClassDef(
29+
self, original_node: cst.ClassDef, updated_node: cst.ClassDef
30+
) -> cst.ClassDef:
31+
# Remove docstring in the class body
32+
new_body = [
33+
stmt
34+
for stmt in updated_node.body.body
35+
if not (
36+
m.matches(stmt, m.SimpleStatementLine())
37+
and m.matches(stmt.body[0], m.Expr())
38+
and m.matches(stmt.body[0].value, m.SimpleString())
39+
)
40+
]
41+
return updated_node.with_changes(body=cst.IndentedBlock(body=new_body))
42+
43+
def leave_FunctionDef(
44+
self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
45+
) -> cst.CSTNode:
46+
new_expr = cst.Expr(value=cst.SimpleString(value=self.replacement_string))
47+
new_body = cst.IndentedBlock((new_expr,))
48+
# another way: replace with pass?
49+
return updated_node.with_changes(body=new_body)
50+
51+
52+
def get_skeleton(raw_code, keep_constant: bool = True):
53+
try:
54+
tree = cst.parse_module(raw_code)
55+
except:
56+
return raw_code
57+
58+
transformer = CompressTransformer(keep_constant=keep_constant)
59+
modified_tree = tree.visit(transformer)
60+
code = modified_tree.code
61+
code = code.replace(CompressTransformer.replacement_string + '\n', '...\n')
62+
code = code.replace(CompressTransformer.replacement_string, '...\n')
63+
return code
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .repository import FileRepository
2+
3+
__all__ = ['FileRepository']
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import fnmatch
2+
import mimetypes
3+
import os
4+
from typing import Dict
5+
6+
import Stemmer
7+
from llama_index.core import SimpleDirectoryReader
8+
from llama_index.retrievers.bm25 import BM25Retriever
9+
10+
from .index.epic_split import (
11+
EpicSplitter,
12+
)
13+
14+
15+
def build_code_retriever_from_repo(
16+
repo_path,
17+
similarity_top_k=10,
18+
min_chunk_size=100,
19+
chunk_size=500,
20+
max_chunk_size=2000,
21+
hard_token_limit=2000,
22+
max_chunks=200,
23+
persist_path=None,
24+
show_progress=False,
25+
):
26+
# print(repo_path)
27+
# Only extract file name and type to not trigger unnecessary embedding jobs
28+
def file_metadata_func(file_path: str) -> Dict:
29+
# print(file_path)
30+
file_path = file_path.replace(repo_path, '')
31+
if file_path.startswith('/'):
32+
file_path = file_path[1:]
33+
34+
test_patterns = [
35+
'**/test/**',
36+
'**/tests/**',
37+
'**/test_*.py',
38+
'**/*_test.py',
39+
]
40+
category = (
41+
'test'
42+
if any(fnmatch.fnmatch(file_path, pattern) for pattern in test_patterns)
43+
else 'implementation'
44+
)
45+
46+
return {
47+
'file_path': file_path,
48+
'file_name': os.path.basename(file_path),
49+
'file_type': mimetypes.guess_type(file_path)[0],
50+
'category': category,
51+
}
52+
53+
reader = SimpleDirectoryReader(
54+
input_dir=repo_path,
55+
exclude=[
56+
'**/test/**',
57+
'**/tests/**',
58+
'**/test_*.py',
59+
'**/*_test.py',
60+
],
61+
file_metadata=file_metadata_func,
62+
filename_as_id=True,
63+
required_exts=['.py'], # TODO: Shouldn't be hardcoded and filtered
64+
recursive=True,
65+
)
66+
docs = reader.load_data()
67+
68+
# splitter = CodeSplitter(
69+
# language="python",
70+
# chunk_lines=100, # lines per chunk
71+
# chunk_lines_overlap=15, # lines overlap between chunks
72+
# max_chars=3000, # max chars per chunk
73+
# )
74+
75+
splitter = EpicSplitter(
76+
min_chunk_size=min_chunk_size,
77+
chunk_size=chunk_size,
78+
max_chunk_size=max_chunk_size,
79+
hard_token_limit=hard_token_limit,
80+
max_chunks=max_chunks,
81+
repo_path=repo_path,
82+
)
83+
prepared_nodes = splitter.get_nodes_from_documents(
84+
docs, show_progress=show_progress
85+
)
86+
87+
# We can pass in the index, docstore, or list of nodes to create the retriever
88+
retriever = BM25Retriever.from_defaults(
89+
nodes=prepared_nodes,
90+
similarity_top_k=similarity_top_k,
91+
stemmer=Stemmer.Stemmer('english'),
92+
language='english',
93+
)
94+
if persist_path:
95+
retriever.persist(persist_path)
96+
return retriever
97+
# keyword = 'FORBIDDEN_ALIAS_PATTERN'
98+
# retrieved_nodes = retriever.retrieve(keyword)
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from typing import Optional
2+
3+
from .parser.java import JavaParser
4+
from .parser.parser import CodeParser
5+
from .parser.python import PythonParser
6+
7+
8+
def supports_codeblocks(path: str):
9+
return path.endswith('.py')
10+
11+
12+
def get_parser_by_path(file_path: str) -> Optional[CodeParser]:
13+
if file_path.endswith('.py'):
14+
return PythonParser()
15+
elif file_path.endswith('.java'):
16+
return JavaParser()
17+
else:
18+
return None

0 commit comments

Comments (0)