Skip to content

Commit 380f229

Browse files
committed
feat: add decodeAsyncGenerator and support any iterable input
1 parent 405e35c commit 380f229

File tree

3 files changed

+100
-61
lines changed

3 files changed

+100
-61
lines changed

encoder.js

+70-35
Original file line numberDiff line numberDiff line change
@@ -165,48 +165,82 @@ function encode(text) {
165165
}
166166

167167
/**
168-
* @param {number[]} tokens
168+
* @param {number} token
169+
* @param {number[]} buffer
170+
* @returns {string | undefined}
171+
*/
172+
function decodeToken(token, buffer) {
// Decodes a single token id into text. `buffer` is owned by the caller and
// carries bytes that are not ready to be emitted from one call to the next.
// Returns the decoded string, or undefined when this token emits nothing.
173+
const decodedToken = decoder[token];
174+
if (typeof decodedToken === "undefined") {
175+
// No entry in `decoder` for this id: emit nothing and leave `buffer` untouched.
return;
176+
}
177+
// Pieces that `byteDecoder` has no entry for map to undefined and are skipped below.
const decodedBytes = splitToken(decodedToken).map((x) => byteDecoder.get(x));
178+
179+
let decodedString = "";
180+
for (const byte of decodedBytes) {
181+
if (typeof byte === "undefined") {
182+
continue;
183+
}
184+
buffer.push(byte);
185+
// The whole pending buffer is re-decoded after every appended byte.
// NOTE(review): `textDecoder` is defined outside this view. If it is a standard
// TextDecoder, `stream: true` also makes it retain incomplete byte sequences
// internally between calls - confirm how that interacts with `buffer`.
const utf16str = textDecoder.decode(new Uint8Array(buffer), {
186+
stream: true,
187+
});
188+
189+
// Check if the last UTF-16 code unit is a high surrogate (0xD800-0xDBFF)
190+
const lastCharCode = utf16str.charCodeAt(utf16str.length - 1);
191+
if (
192+
utf16str.length > 0 &&
193+
lastCharCode >= 0xd8_00 &&
194+
lastCharCode <= 0xdb_ff
195+
) {
196+
// Keep the pending bytes in the buffer and move on to the next byte. The
// buffer outlives this call, so the rest may arrive with a later token.
197+
continue;
198+
} else {
199+
// An empty decode result also lands here (the length guard above fails),
// so the buffer is cleared in that case too.
decodedString += utf16str;
200+
// Reset the buffer in place so the caller's array reference stays valid
201+
buffer.length = 0;
202+
}
203+
}
204+
// Only a non-empty string is returned; otherwise the result is undefined.
if (decodedString.length > 0) {
205+
return decodedString;
206+
}
207+
}
208+
209+
/**
210+
* @param {Iterable<number>} tokens
169211
* @returns {Generator<string, void, undefined>}
170212
*/
171213
function* decodeGenerator(tokens) {
172214
/** @type {number[]} */
173-
let buffer = [];
215+
// Pending bytes shared across decodeToken calls; decodeToken empties it in place.
const buffer = [];
174216

175217
for (const token of tokens) {
176-
const decodedToken = decoder[token];
177-
if (typeof decodedToken === "undefined") {
178-
continue;
218+
// decodeToken returns undefined when the token produced no text to emit.
const result = decodeToken(token, buffer);
219+
if (typeof result !== "undefined") {
220+
yield result;
179221
}
180-
const decodedBytes = splitToken(decodedToken).map((x) =>
181-
byteDecoder.get(x),
182-
);
183-
184-
let decodedString = "";
185-
for (const byte of decodedBytes) {
186-
if (typeof byte === "undefined") {
187-
continue;
188-
}
189-
buffer.push(byte);
190-
const utf16str = textDecoder.decode(new Uint8Array(buffer), {
191-
stream: true,
192-
});
193-
194-
// Check if the last character is a high surrogate
195-
const lastCharCode = utf16str.charCodeAt(utf16str.length - 1);
196-
if (
197-
utf16str.length > 0 &&
198-
lastCharCode >= 0xd8_00 &&
199-
lastCharCode <= 0xdb_ff
200-
) {
201-
// Keep the high surrogate in the buffer and continue with the next token
202-
continue;
203-
} else {
204-
decodedString += utf16str;
205-
buffer = [];
206-
}
207-
}
208-
if (decodedString.length > 0) {
209-
yield decodedString;
222+
}
223+
224+
// Yield any remaining characters in the buffer
// (this final decode is called without `stream: true`, unlike the per-byte
// decode inside decodeToken)
225+
if (buffer.length > 0) {
226+
yield textDecoder.decode(new Uint8Array(buffer));
227+
}
228+
}
229+
230+
/**
231+
* Decode tokens asynchronously and yield the decoded strings, one by one.
232+
* Will not yield for tokens that include a high surrogate, but wait for the next token.
233+
* @param {AsyncIterable<number>} tokens
234+
* @returns {AsyncGenerator<string, void, undefined>}
235+
*/
236+
async function* decodeAsyncGenerator(tokens) {
237+
/** @type {number[]} */
238+
const buffer = [];
239+
240+
for await (const token of tokens) {
241+
const result = decodeToken(token, buffer);
242+
if (typeof result !== "undefined") {
243+
yield result;
210244
}
211245
}
212246

@@ -228,4 +262,5 @@ module.exports.encode = encode;
228262
module.exports.decode = decode;
229263
module.exports.encodeGenerator = encodeGenerator;
230264
module.exports.decodeGenerator = decodeGenerator;
265+
module.exports.decodeAsyncGenerator = decodeAsyncGenerator;
231266
module.exports.isWithinTokenLimit = isWithinTokenLimit;

encoder.test.js

+23-24
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ const {
33
encode,
44
isWithinTokenLimit,
55
decodeGenerator,
6+
decodeAsyncGenerator,
67
} = require("./encoder");
78

89
test("empty string", () => {
@@ -42,16 +43,18 @@ test("multi-token word", () => {
4243
expect(isWithinTokenLimit(str, 3)).toEqual(3);
4344
});
4445

46+
const helloWorldTokens = [31373, 50169, 233, 995, 12520, 234, 235];
47+
4548
// Round-trips a string containing emojis and checks the token-limit helper.
test("emojis", () => {
4649
const str = "hello 👋 world 🌍";
47-
expect(encode(str)).toEqual([31373, 50169, 233, 995, 12520, 234, 235]);
50+
// Same seven ids as the inline literal it replaces, now shared via helloWorldTokens.
expect(encode(str)).toEqual(helloWorldTokens);
4851
expect(decode(encode(str))).toEqual(str);
4952
// Expected to be false when the limit (4) is below the 7 tokens of `str`...
expect(isWithinTokenLimit(str, 4)).toEqual(false);
5053
// ...and the token count (7) when the limit is large enough.
expect(isWithinTokenLimit(str, 400)).toEqual(7);
5154
});
5255

5356
test("decode token-by-token via generator", () => {
54-
const generator = decodeGenerator([31373, 50169, 233, 995, 12520, 234, 235]);
57+
const generator = decodeGenerator(helloWorldTokens);
5558
expect(generator.next().value).toEqual("hello");
5659
expect(generator.next().value).toEqual(" ");
5760
expect(generator.next().value).toEqual("👋");
@@ -60,6 +63,20 @@ test("decode token-by-token via generator", () => {
6063
expect(generator.next().value).toEqual("🌍");
6164
});
6265

66+
// Test helper: exposes helloWorldTokens as an async generator, yielding each
// id after awaiting an already-resolved promise, so it can be passed to
// decodeAsyncGenerator as async input.
async function* getHelloWorldTokensAsync() {
67+
for (const token of helloWorldTokens) {
68+
yield await Promise.resolve(token);
69+
}
70+
}
71+
72+
// Feeds the async token stream through decodeAsyncGenerator and compares each
// yielded chunk, in order, with the expected text pieces.
test("decode token-by-token via async generator", async () => {
73+
const generator = decodeAsyncGenerator(getHelloWorldTokensAsync());
74+
// Expected chunks in yield order; each comparison consumes the front entry.
const decoded = ["hello", " ", "👋", " world", " ", "🌍"];
75+
for await (const value of generator) {
76+
expect(value).toEqual(decoded.shift());
77+
}
// NOTE(review): nothing asserts that `decoded` is empty after the loop, so a
// generator that stops early (or yields nothing) would still pass.
78+
});
79+
6380
test("properties of Object", () => {
6481
const str = "toString constructor hasOwnProperty valueOf";
6582

@@ -72,28 +89,10 @@ test("properties of Object", () => {
7289
test("text with commas", () => {
7390
const str = "hello, I am a text, and I have commas. a,b,c";
7491
expect(decode(encode(str))).toEqual(str);
75-
expect(encode(str)).toMatchInlineSnapshot(`
76-
[
77-
31373,
78-
11,
79-
314,
80-
716,
81-
257,
82-
2420,
83-
11,
84-
290,
85-
314,
86-
423,
87-
725,
88-
292,
89-
13,
90-
257,
91-
11,
92-
65,
93-
11,
94-
66,
95-
]
96-
`);
92+
expect(encode(str)).toStrictEqual([
93+
31373, 11, 314, 716, 257, 2420, 11, 290, 314, 423, 725, 292, 13, 257, 11,
94+
65, 11, 66,
95+
]);
9796
expect(isWithinTokenLimit(str, 15)).toEqual(false);
9897
expect(isWithinTokenLimit(str, 300)).toEqual(18);
9998
});

tsconfig.json

+7-2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
/* Completeness */
2323
"skipLibCheck": true /* Skip type checking all .d.ts files. */
2424
},
25-
"include": ["*.js", "data/*"],
26-
"exclude": ["jest.config.js"]
25+
"include": [
26+
"*.js",
27+
"data/*.js"
28+
],
29+
"exclude": [
30+
"jest.config.js"
31+
]
2732
}

0 commit comments

Comments
 (0)