Skip to content

Commit f28b7df

Browse files
authored
Add support for more languages in SplitRecursively (#144)
#109
1 parent ad7f111 commit f28b7df

File tree

3 files changed

+141
-22
lines changed

3 files changed

+141
-22
lines changed

Cargo.toml

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,34 @@ console-subscriber = "0.4.1"
4949
env_logger = "0.11.7"
5050
reqwest = { version = "0.12.13", features = ["json"] }
5151
async-openai = "0.28.0"
52-
5352
tree-sitter = "0.25.3"
5453
tree-sitter-language = "0.1.5"
55-
tree-sitter-python = "0.23.6"
54+
# Per language tree-sitter parsers
55+
tree-sitter-c = "0.23.4"
56+
tree-sitter-cpp = "0.23.4"
57+
tree-sitter-c-sharp = "0.23.1"
58+
tree-sitter-css = "0.23.2"
59+
tree-sitter-fortran = "0.5.0"
60+
tree-sitter-go = "0.23.4"
61+
tree-sitter-html = "0.23.2"
62+
tree-sitter-java = "0.23.5"
5663
tree-sitter-javascript = "0.23.1"
57-
tree-sitter-typescript = "0.23.2"
64+
tree-sitter-json = "0.24.8"
5865
tree-sitter-md = "0.3.2"
66+
tree-sitter-pascal = "0.10.0"
67+
tree-sitter-php = "0.23.11"
68+
tree-sitter-python = "0.23.6"
69+
tree-sitter-r = "1.1.0"
70+
tree-sitter-ruby = "0.23.1"
71+
tree-sitter-rust = "0.23.2"
72+
tree-sitter-scala = "0.23.4"
73+
tree-sitter-scss = "1.0.0"
74+
tree-sitter-sequel = "0.3.8"
75+
tree-sitter-swift = "0.7.0"
76+
tree-sitter-toml-ng = "0.7.0"
77+
tree-sitter-typescript = "0.23.2"
78+
tree-sitter-xml = "0.7.0"
79+
tree-sitter-yaml = "0.7.0"
80+
5981
globset = "0.4.16"
82+
unicase = "2.8.1"

examples/code_embedding/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
1616
Define an example flow that embeds files into a vector database.
1717
"""
1818
data_scope["files"] = flow_builder.add_source(
19-
cocoindex.sources.LocalFile(path="../../python", included_patterns=["**/*.py"]))
20-
19+
cocoindex.sources.LocalFile(path="../..",
20+
included_patterns=["*.py"],
21+
excluded_patterns=[".*"]))
2122
code_embeddings = data_scope.add_collector()
2223

2324
with data_scope["files"].row() as file:

src/ops/functions/split_recursively.rs

Lines changed: 112 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use regex::{Matches, Regex};
44
use std::collections::HashSet;
55
use std::sync::LazyLock;
66
use std::{collections::HashMap, sync::Arc};
7+
use unicase::UniCase;
78

89
use crate::base::field_attrs;
910
use crate::{fields_value, ops::sdk::*};
@@ -31,10 +32,10 @@ struct LanguageConfig {
3132
}
3233

3334
fn add_language<'a>(
34-
output: &'a mut HashMap<&'static str, Arc<LanguageConfig>>,
35+
output: &'a mut HashMap<UniCase<&'static str>, Arc<LanguageConfig>>,
3536
name: &'static str,
3637
aliases: impl IntoIterator<Item = &'static str>,
37-
lang_fn: tree_sitter_language::LanguageFn,
38+
lang_fn: impl Into<tree_sitter::Language>,
3839
terminal_node_kinds: impl IntoIterator<Item = &'a str>,
3940
) {
4041
let tree_sitter_lang: tree_sitter::Language = lang_fn.into();
@@ -58,49 +59,143 @@ fn add_language<'a>(
5859
terminal_node_kind_ids,
5960
});
6061
for name in std::iter::once(name).chain(aliases.into_iter()) {
61-
if output.insert(name, config.clone()).is_some() {
62+
if output.insert(name.into(), config.clone()).is_some() {
6263
panic!("Language `{name}` already exists");
6364
}
6465
}
6566
}
6667

67-
static TREE_SITTER_LANGUAGE_BY_LANG: LazyLock<HashMap<&'static str, Arc<LanguageConfig>>> =
68+
static TREE_SITTER_LANGUAGE_BY_LANG: LazyLock<HashMap<UniCase<&'static str>, Arc<LanguageConfig>>> =
6869
LazyLock::new(|| {
6970
let mut map = HashMap::new();
71+
add_language(&mut map, "C", [".c"], tree_sitter_c::LANGUAGE, []);
7072
add_language(
7173
&mut map,
72-
"Python",
73-
["py", "python"],
74-
tree_sitter_python::LANGUAGE,
74+
"C++",
75+
[".cpp", ".cc", ".cxx", ".h", ".hpp", "cpp"],
76+
tree_sitter_cpp::LANGUAGE,
77+
[],
78+
);
79+
add_language(
80+
&mut map,
81+
"C#",
82+
[".cs", "cs"],
83+
tree_sitter_c_sharp::LANGUAGE,
84+
[],
85+
);
86+
add_language(&mut map, "CSS", [".css"], tree_sitter_css::LANGUAGE, []);
87+
add_language(
88+
&mut map,
89+
"Fortran",
90+
[".f", ".f90", ".f95", ".f03", "f", "f90", "f95", "f03"],
91+
tree_sitter_fortran::LANGUAGE,
92+
[],
93+
);
94+
add_language(
95+
&mut map,
96+
"Go",
97+
[".go", "golang"],
98+
tree_sitter_go::LANGUAGE,
99+
[],
100+
);
101+
add_language(
102+
&mut map,
103+
"HTML",
104+
[".html", ".htm"],
105+
tree_sitter_html::LANGUAGE,
75106
[],
76107
);
108+
add_language(&mut map, "Java", [".java"], tree_sitter_java::LANGUAGE, []);
77109
add_language(
78110
&mut map,
79111
"JavaScript",
80-
["JS", "js", "Javascript", "javascript"],
112+
[".js", "js"],
81113
tree_sitter_javascript::LANGUAGE,
82114
[],
83115
);
116+
add_language(&mut map, "JSON", [".json"], tree_sitter_json::LANGUAGE, []);
84117
add_language(
85118
&mut map,
86-
"TypeScript",
87-
["TS", "ts", "Typescript", "typescript"],
88-
tree_sitter_typescript::LANGUAGE_TYPESCRIPT,
119+
"Markdown",
120+
[".md", "md"],
121+
tree_sitter_md::LANGUAGE,
122+
["inline"],
123+
);
124+
add_language(
125+
&mut map,
126+
"Pascal",
127+
[".pas", "pas", ".dpr", "dpr", "Delphi"],
128+
tree_sitter_pascal::LANGUAGE,
129+
[],
130+
);
131+
add_language(&mut map, "PHP", [".php"], tree_sitter_php::LANGUAGE_PHP, []);
132+
add_language(
133+
&mut map,
134+
"Python",
135+
[".py"],
136+
tree_sitter_python::LANGUAGE,
137+
[],
138+
);
139+
add_language(&mut map, "R", [".r"], tree_sitter_r::LANGUAGE, []);
140+
add_language(&mut map, "Ruby", [".rb"], tree_sitter_ruby::LANGUAGE, []);
141+
add_language(
142+
&mut map,
143+
"Rust",
144+
[".rs", "rs"],
145+
tree_sitter_rust::LANGUAGE,
146+
[],
147+
);
148+
add_language(
149+
&mut map,
150+
"Scala",
151+
[".scala"],
152+
tree_sitter_scala::LANGUAGE,
153+
[],
154+
);
155+
add_language(
156+
&mut map,
157+
"SCSS",
158+
[".scss"],
159+
tree_sitter_scss::language(),
160+
[],
161+
);
162+
add_language(&mut map, "SQL", [".sql"], tree_sitter_sequel::LANGUAGE, []);
163+
add_language(
164+
&mut map,
165+
"Swift",
166+
[".swift"],
167+
tree_sitter_swift::LANGUAGE,
168+
[],
169+
);
170+
add_language(
171+
&mut map,
172+
"TOML",
173+
[".toml"],
174+
tree_sitter_toml_ng::LANGUAGE,
89175
[],
90176
);
91177
add_language(
92178
&mut map,
93179
"TSX",
94-
["tsx"],
180+
[".tsx"],
95181
tree_sitter_typescript::LANGUAGE_TSX,
96182
[],
97183
);
98184
add_language(
99185
&mut map,
100-
"Markdown",
101-
["md", "markdown"],
102-
tree_sitter_md::LANGUAGE.into(),
103-
["inline"],
186+
"TypeScript",
187+
[".ts", "ts"],
188+
tree_sitter_typescript::LANGUAGE_TYPESCRIPT,
189+
[],
190+
);
191+
add_language(&mut map, "XML", [".xml"], tree_sitter_xml::LANGUAGE_XML, []);
192+
add_language(&mut map, "DTD", [".dtd"], tree_sitter_xml::LANGUAGE_DTD, []);
193+
add_language(
194+
&mut map,
195+
"YAML",
196+
[".yaml", ".yml"],
197+
tree_sitter_yaml::LANGUAGE,
198+
[],
104199
);
105200
map
106201
});
@@ -416,7 +511,7 @@ impl SimpleFunctionExecutor for Executor {
416511
.optional()
417512
.map(|v| anyhow::Ok(v.as_str()?.as_ref()))
418513
.transpose()?
419-
.and_then(|lang| TREE_SITTER_LANGUAGE_BY_LANG.get(lang))
514+
.and_then(|lang| TREE_SITTER_LANGUAGE_BY_LANG.get(&UniCase::new(lang)))
420515
};
421516

422517
let recursive_chunker = RecursiveChunker {

0 commit comments

Comments
 (0)