import { expect } from "chai";

import { tokenize } from "./text.js";
import { AutoTokenizer, PreTrainedTokenizer } from "@xenova/transformers";
import { Repeat } from "immutable";

interface TokenizerOutput {
  input_ids: number[];
}

/**
 * Encodes the text into token IDs and then decodes them back to text.
 * Special tokens are skipped during decoding.
 *
 * @param tokenizer - An instance of a PreTrainedTokenizer
 * @param text - The text to process
 * @returns The decoded text obtained after encoding and then decoding
 */
export function encodeDecode(tokenizer: PreTrainedTokenizer, text: string): string {
  // Encode the text using the tokenizer.
  const encoding = tokenizer(text, { return_tensor: false }) as TokenizerOutput;
  // Decode the token IDs back into text while skipping special tokens.
  return tokenizer.decode(encoding.input_ids, { skip_special_tokens: true });
}
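
// A minimal usage sketch (not itself a test): round-tripping a plain ASCII string
// through the GPT-2 tokenizer should give back the original text, which is what the
// "Encode-Decode tokenization" suite below asserts.
//
//   const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
//   encodeDecode(tokenizer, "Hello, world!"); // => "Hello, world!"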

describe("text processing", () => {
  const text = [
    "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia",
    "written and maintained by a community \n of volunteers, known as Wikipedians.",
    "Founded by Jimmy Wales and Larry Sanger on January 15, 2001, Wikipedia is hosted by the",
    "Wikimedia Foundation, an American nonprofit organization that employs a staff of over 700 people.[7]"
  ].join(" ");

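  // GPT-2 token IDs for `text`, used as the reference in the GPT-2 and truncation tests below.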
  const expectedTokens = [
    15496, 995, 11, 257, 47125, 352, 2242, 2231, 11, 705, 30, 860, 4304, 13,
    15312, 318, 257, 1479, 2695, 2691, 45352, 3194, 290, 9456, 416, 257, 2055,
    220, 198, 286, 11661, 11, 1900, 355, 11145, 46647, 1547, 13, 4062, 276, 416,
    12963, 11769, 290, 13633, 311, 2564, 319, 3269, 1315, 11, 5878, 11, 15312,
    318, 12007, 416, 262, 44877, 5693, 11, 281, 1605, 15346, 4009, 326, 24803,
    257, 3085, 286, 625, 13037, 661, 3693, 22, 60,
  ];

  const shortText = 'import { AutoTokenizer } from "@xenova/transformers";';
  // Token IDs for `shortText` with the GPT-2 tokenizer.
  const shortExpectedTokens = [
    11748, 1391, 11160, 30642, 7509, 1782, 422,
    44212, 87, 268, 10071, 14, 35636, 364, 8172
  ];

  it("can tokenize text with the Llama 3 tokenizer", async () => {
    const tokenizer = await AutoTokenizer.from_pretrained("Xenova/llama-3-tokenizer");
    // Tokenizer playgrounds aren't consistent: https://github.com/huggingface/transformers.js/issues/1019
    // Tokenization with Python:
    // from transformers import AutoTokenizer
    // tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
    // tokenizer.encode(text, add_special_tokens=False)
    const expectedTokens = [
      9906, 1917, 11, 264, 18399, 220, 16, 220, 11727, 20, 11, 32167,
      220, 25208, 13, 27685, 374, 264, 1949, 2262, 2930, 83708, 5439, 323, 18908,
      555, 264, 4029, 720, 315, 23872, 11, 3967, 439, 119234, 291, 5493, 13, 78811,
      555, 28933, 23782, 323, 30390, 328, 4091, 389, 6186, 220, 868, 11, 220, 1049,
      16, 11, 27685, 374, 21685, 555, 279, 90940, 5114, 11, 459, 3778, 33184, 7471,
      430, 51242, 264, 5687, 315, 927, 220, 7007, 1274, 8032, 22, 60
    ];
    const tokens = tokenize(tokenizer, text);
    expect(tokens.toArray()).to.be.deep.equal(expectedTokens);
  });

  it("can tokenize text with the GPT2 tokenizer", async () => {
    const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");

    const tokens = tokenize(tokenizer, text);
    expect(tokens.toArray()).to.be.deep.equal(expectedTokens);
  });

  it("truncates to the expected length", async () => {
    const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");

    const tokens = tokenize(tokenizer, text, { truncation: true, max_length: 10 });
    expect(tokens.toArray()).to.be.deep.equal(expectedTokens.slice(0, 10));
  });
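
  // The two padding tests below cover both sides: with `padding: true` and no
  // `padding_side`, pad tokens are prepended (left padding); `padding_side: 'right'`
  // appends them after the token sequence instead.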

  it("pads the sequence until enough tokens are generated", async () => {
    const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
    const max_length = 20;

    const tokens = tokenize(tokenizer, shortText, { padding: true, max_length });
    const paddedSequence = Repeat(tokenizer.pad_token_id, max_length - shortExpectedTokens.length)
      .concat(shortExpectedTokens).toArray();
    expect(tokens.toArray()).to.be.deep.equal(paddedSequence);
  });

  it("can pad on the right side", async () => {
    const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
    const max_length = 20;

    const tokens = tokenize(tokenizer, shortText, { padding: true, padding_side: 'right', max_length });
    const paddedSequence = shortExpectedTokens.concat(
      Repeat(tokenizer.pad_token_id, max_length - shortExpectedTokens.length).toArray()
    );
    expect(tokens.toArray()).to.be.deep.equal(paddedSequence);
  });
});

describe("Multi-Tokenizer Tests", function () {
  this.timeout(20000);

  const sampleText = "Hello, world! This is a test string to check tokenization.";

  // List of tokenizer names to test
  const tokenizerNames = [
    "Xenova/gpt2",
    "Xenova/llama-3-tokenizer",
    // "Xenova/bert-base-uncased", // takes too long
    "Xenova/roberta-base",
    "Xenova/distilbert-base-uncased"
  ];

  tokenizerNames.forEach((name) => {
    it(`should tokenize text using tokenizer "${name}"`, async () => {
      const tokenizer = await AutoTokenizer.from_pretrained(name);
      const tokens = tokenize(tokenizer, sampleText);
      const tokenArray = tokens.toArray();

      // Checks that we got a non-empty array of tokens and that each token is a number.
      expect(tokenArray).to.be.an("array").that.is.not.empty;
      tokenArray.forEach((token) => {
        expect(token).to.be.a("number");
      });
    });
  });
});

describe("Encode-Decode tokenization", function () {
  this.timeout(20000);

  it("should return text close to the original after encode-decode tokenization using GPT2 tokenizer", async function () {
    // Load the GPT-2 tokenizer
    const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
    const originalText = "Hello, world! This is a test for encode-decode tokenization.";

    // Perform round-trip tokenization
    const decodedText = encodeDecode(tokenizer, originalText);

    // Check that the decoded text matches the original text exactly
    expect(decodedText).to.equal(originalText);
  });
});