1
1
from dotenv import load_dotenv
2
2
3
3
import cocoindex
4
+ import os
5
+
6
+ class ExtractExtension (cocoindex .op .FunctionSpec ):
7
+ """Extract the extension (including the leading dot) of a filename."""
8
+
9
+ @cocoindex .op .executor_class ()
10
+ class ExtractExtensionExecutor :
11
+ """Executor for ExtractExtension."""
12
+
13
+ spec : ExtractExtension
14
+
15
+ def __call__ (self , filename : str ) -> str :
16
+ return os .path .splitext (filename )[1 ]
4
17
5
18
def code_to_embedding (text : cocoindex .DataSlice ) -> cocoindex .DataSlice :
6
19
"""
@@ -17,14 +30,15 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
17
30
"""
18
31
data_scope ["files" ] = flow_builder .add_source (
19
32
cocoindex .sources .LocalFile (path = "../.." ,
20
- included_patterns = ["*.py" ],
21
- excluded_patterns = [".*" ]))
33
+ included_patterns = ["*.py" , "*.rs" , "*.toml" , "*.md" , "*.mdx" ],
34
+ excluded_patterns = [".*" , "target" , "**/node_modules" ]))
22
35
code_embeddings = data_scope .add_collector ()
23
36
24
37
with data_scope ["files" ].row () as file :
38
+ file ["extension" ] = file ["filename" ].transform (ExtractExtension ())
25
39
file ["chunks" ] = file ["content" ].transform (
26
40
cocoindex .functions .SplitRecursively (),
27
- language = "python" , chunk_size = 1000 , chunk_overlap = 300 )
41
+ language = file [ "extension" ] , chunk_size = 1000 , chunk_overlap = 300 )
28
42
with file ["chunks" ].row () as chunk :
29
43
chunk ["embedding" ] = chunk ["text" ].call (code_to_embedding )
30
44
code_embeddings .collect (filename = file ["filename" ], location = chunk ["location" ],
0 commit comments