Skip to content

Commit fb5fe56

Browse files
ryanhoangtczlll and co-authors authored
Add tools from LocAgent (#108)
* move code from locagent pr * fix test * Fix tree-sitter compatibility issue between LocAgent and OpenHands (#121) * Fix tree-sitter compatibility with version 0.24.0 * update query_file * poetry lock * fix test * fix test & remote ts compat layer * skip on CI * remove tree-siter-languages & use pytest-forked --------- Co-authored-by: Hoang Tran <descience.thh10@gmail.com> * add missing packages as required deps * bump to 0.2.14 --------- Co-authored-by: Zhaoling Chen <1781750070@qq.com>
1 parent 5efbe15 commit fb5fe56

37 files changed

+10223
-364
lines changed

.github/workflows/py-intg-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ jobs:
3737
POETRY_VIRTUALENVS_CREATE: false
3838
- name: Run tests
3939
run: |
40-
poetry run pytest ./tests/integration
40+
poetry run pytest ./tests/integration --forked

.github/workflows/py-unit-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ jobs:
3232
POETRY_VIRTUALENVS_CREATE: false
3333
- name: Run tests
3434
run: |
35-
poetry run pytest ./tests/unit
35+
poetry run pytest ./tests/unit --forked

dev_config/python/.pre-commit-config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
exclude: ^openhands_aci/indexing/locagent/
2+
13
repos:
24
- repo: https://github.com/pre-commit/pre-commit-hooks
35
rev: v4.5.0
@@ -38,6 +40,6 @@ repos:
3840
- id: mypy
3941
additional_dependencies:
4042
[types-requests, types-setuptools, types-pyyaml, types-toml, types-cachetools]
41-
entry: mypy --config-file dev_config/python/mypy.ini openhands_aci/
43+
entry: mypy --config-file dev_config/python/mypy.ini openhands_aci/ --exclude 'openhands_aci/indexing/locagent/'
4244
always_run: true
4345
pass_filenames: false
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import libcst as cst
2+
import libcst.matchers as m
3+
4+
5+
class CompressTransformer(cst.CSTTransformer):
6+
DESCRIPTION = str = 'Replaces function body with ...'
7+
replacement_string = '"$$FUNC_BODY_REPLACEMENT_STRING$$"'
8+
9+
def __init__(self, keep_constant=True):
10+
self.keep_constant = keep_constant
11+
12+
def leave_Module(
13+
self, original_node: cst.Module, updated_node: cst.Module
14+
) -> cst.Module:
15+
new_body = [
16+
stmt
17+
for stmt in updated_node.body
18+
if m.matches(stmt, m.ClassDef())
19+
or m.matches(stmt, m.FunctionDef())
20+
or (
21+
self.keep_constant
22+
and m.matches(stmt, m.SimpleStatementLine())
23+
and m.matches(stmt.body[0], m.Assign())
24+
)
25+
]
26+
return updated_node.with_changes(body=new_body)
27+
28+
def leave_ClassDef(
29+
self, original_node: cst.ClassDef, updated_node: cst.ClassDef
30+
) -> cst.ClassDef:
31+
# Remove docstring in the class body
32+
new_body = [
33+
stmt
34+
for stmt in updated_node.body.body
35+
if not (
36+
m.matches(stmt, m.SimpleStatementLine())
37+
and m.matches(stmt.body[0], m.Expr())
38+
and m.matches(stmt.body[0].value, m.SimpleString())
39+
)
40+
]
41+
return updated_node.with_changes(body=cst.IndentedBlock(body=new_body))
42+
43+
def leave_FunctionDef(
44+
self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
45+
) -> cst.CSTNode:
46+
new_expr = cst.Expr(value=cst.SimpleString(value=self.replacement_string))
47+
new_body = cst.IndentedBlock((new_expr,))
48+
# another way: replace with pass?
49+
return updated_node.with_changes(body=new_body)
50+
51+
52+
def get_skeleton(raw_code, keep_constant: bool = True):
53+
try:
54+
tree = cst.parse_module(raw_code)
55+
except:
56+
return raw_code
57+
58+
transformer = CompressTransformer(keep_constant=keep_constant)
59+
modified_tree = tree.visit(transformer)
60+
code = modified_tree.code
61+
code = code.replace(CompressTransformer.replacement_string + '\n', '...\n')
62+
code = code.replace(CompressTransformer.replacement_string, '...\n')
63+
return code
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .repository import FileRepository
2+
3+
__all__ = ['FileRepository']
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import fnmatch
2+
import mimetypes
3+
import os
4+
from typing import Dict
5+
6+
import Stemmer
7+
from llama_index.core import SimpleDirectoryReader
8+
from llama_index.retrievers.bm25 import BM25Retriever
9+
10+
from .index.epic_split import (
11+
EpicSplitter,
12+
)
13+
14+
15+
def build_code_retriever_from_repo(
16+
repo_path,
17+
similarity_top_k=10,
18+
min_chunk_size=100,
19+
chunk_size=500,
20+
max_chunk_size=2000,
21+
hard_token_limit=2000,
22+
max_chunks=200,
23+
persist_path=None,
24+
show_progress=False,
25+
):
26+
# print(repo_path)
27+
# Only extract file name and type to not trigger unnecessary embedding jobs
28+
def file_metadata_func(file_path: str) -> Dict:
29+
# print(file_path)
30+
file_path = file_path.replace(repo_path, '')
31+
if file_path.startswith('/'):
32+
file_path = file_path[1:]
33+
34+
test_patterns = [
35+
'**/test/**',
36+
'**/tests/**',
37+
'**/test_*.py',
38+
'**/*_test.py',
39+
]
40+
category = (
41+
'test'
42+
if any(fnmatch.fnmatch(file_path, pattern) for pattern in test_patterns)
43+
else 'implementation'
44+
)
45+
46+
return {
47+
'file_path': file_path,
48+
'file_name': os.path.basename(file_path),
49+
'file_type': mimetypes.guess_type(file_path)[0],
50+
'category': category,
51+
}
52+
53+
reader = SimpleDirectoryReader(
54+
input_dir=repo_path,
55+
exclude=[
56+
'**/test/**',
57+
'**/tests/**',
58+
'**/test_*.py',
59+
'**/*_test.py',
60+
],
61+
file_metadata=file_metadata_func,
62+
filename_as_id=True,
63+
required_exts=['.py'], # TODO: Shouldn't be hardcoded and filtered
64+
recursive=True,
65+
)
66+
docs = reader.load_data()
67+
68+
# splitter = CodeSplitter(
69+
# language="python",
70+
# chunk_lines=100, # lines per chunk
71+
# chunk_lines_overlap=15, # lines overlap between chunks
72+
# max_chars=3000, # max chars per chunk
73+
# )
74+
75+
splitter = EpicSplitter(
76+
min_chunk_size=min_chunk_size,
77+
chunk_size=chunk_size,
78+
max_chunk_size=max_chunk_size,
79+
hard_token_limit=hard_token_limit,
80+
max_chunks=max_chunks,
81+
repo_path=repo_path,
82+
)
83+
prepared_nodes = splitter.get_nodes_from_documents(
84+
docs, show_progress=show_progress
85+
)
86+
87+
# We can pass in the index, docstore, or list of nodes to create the retriever
88+
retriever = BM25Retriever.from_defaults(
89+
nodes=prepared_nodes,
90+
similarity_top_k=similarity_top_k,
91+
stemmer=Stemmer.Stemmer('english'),
92+
language='english',
93+
)
94+
if persist_path:
95+
retriever.persist(persist_path)
96+
return retriever
97+
# keyword = 'FORBIDDEN_ALIAS_PATTERN'
98+
# retrieved_nodes = retriever.retrieve(keyword)
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from typing import Optional
2+
3+
from .parser.java import JavaParser
4+
from .parser.parser import CodeParser
5+
from .parser.python import PythonParser
6+
7+
8+
def supports_codeblocks(path: str):
9+
return path.endswith('.py')
10+
11+
12+
def get_parser_by_path(file_path: str) -> Optional[CodeParser]:
13+
if file_path.endswith('.py'):
14+
return PythonParser()
15+
elif file_path.endswith('.java'):
16+
return JavaParser()
17+
else:
18+
return None

0 commit comments

Comments (0)