Skip to content

Commit ef519a2

Browse files
larryliu0820facebook-github-bot
authored andcommitted
Add regex unit tests and enable shared linkage in fbcode (#78)
Summary: Pull Request resolved: #78 As titled. After some investigation it looks like we need `--dynamic-list` to expose the weak symbol `create_fallback_regex` in `regex.so`. Without `--dynamic-list`, linker will only look for locally defined symbol (the weak implementation) and ignore the strong implementation. This way, when linking `regex_lookahead.so` the exported dynamic symbol can be bound to the strong implementation in `regex_lookahead.cpp`. This makes the unit tests work in fbcode, where by default we are using `mode/dev` which links everything in shared linkage. If using `mode/opt` we don't need `--dynamic-list` because the linking order doesn't matter. Differential Revision: D75391108
1 parent be07807 commit ef519a2

File tree

4 files changed

+59
-2
lines changed

4 files changed

+59
-2
lines changed

include/pytorch/tokenizers/regex.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern);
5555
* @param pattern The regex pattern to compile.
5656
* @return A unique pointer to an IRegex-compatible object.
5757
*/
58-
Result<std::unique_ptr<IRegex>> create_fallback_regex(
59-
const std::string& pattern) TK_WEAK;
58+
TK_WEAK Result<std::unique_ptr<IRegex>> create_fallback_regex(
59+
const std::string& pattern);
6060

6161
} // namespace tokenizers

src/dynamic_functions_list.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
extern "C++" {
3+
*create_fallback_regex*;
4+
};
5+
};

targets.bzl

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ def define_common_targets():
2323
platforms = PLATFORMS,
2424
)
2525

26+
# In fbcode we use a dynamic list to ensure that the dynamic linker doesn't
27+
# always bind to the local defined weak symbols. This is telling the linker
28+
# to export the list of dynamic symbols that are defined in the library and
29+
# when later linked with another shared library, the linker will bind to the
30+
# strong symbols in the other library. This is needed in mode/dev where all
31+
# libraries are built into shared libraries.
32+
# See https://sourceware.org/binutils/docs-2.36/ld/Options.html for more info
33+
# on --dynamic-list.
34+
runtime.export_file(
35+
name = "dynamic_functions_list",
36+
src = "src/dynamic_functions_list.txt",
37+
)
38+
2639
runtime.cxx_library(
2740
name = "regex",
2841
srcs = [
@@ -35,6 +48,9 @@ def define_common_targets():
3548
exported_external_deps = [
3649
"re2",
3750
],
51+
linker_flags = [
52+
"--dynamic-list=$(location :dynamic_functions_list)",
53+
],
3854
visibility = ["//pytorch/tokenizers/..."],
3955
header_namespace = "",
4056
platforms = PLATFORMS,
@@ -49,10 +65,12 @@ def define_common_targets():
4965
],
5066
exported_deps = [
5167
":headers",
68+
":regex",
5269
],
5370
exported_external_deps = [
5471
"pcre2",
5572
],
73+
link_whole = True,
5674
visibility = [
5775
"@EXECUTORCH_CLIENTS",
5876
"//pytorch/tokenizers/...",
@@ -119,6 +137,29 @@ def define_common_targets():
119137
platforms = PLATFORMS,
120138
)
121139

140+
runtime.cxx_library(
141+
name = "tiktoken_lookahead",
142+
srcs = [
143+
"src/tiktoken.cpp",
144+
],
145+
deps = [
146+
":regex",
147+
":regex_lookahead",
148+
],
149+
exported_deps = [
150+
":bpe_tokenizer_base",
151+
":headers",
152+
],
153+
exported_external_deps = [
154+
"re2",
155+
],
156+
visibility = [
157+
"@EXECUTORCH_CLIENTS",
158+
"//pytorch/tokenizers/...",
159+
],
160+
platforms = PLATFORMS,
161+
)
162+
122163
runtime.cxx_library(
123164
name = "hf_tokenizer",
124165
srcs = [

test/targets.bzl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,17 @@ def define_common_targets():
9696
],
9797
)
9898

99+
runtime.cxx_test(
100+
name = "test_regex",
101+
srcs = [
102+
"test_regex.cpp",
103+
],
104+
deps = [
105+
"//pytorch/tokenizers:regex_lookahead",
106+
"//pytorch/tokenizers:headers",
107+
],
108+
)
109+
99110
runtime.filegroup(
100111
name = "resources",
101112
srcs = native.glob([

0 commit comments

Comments
 (0)