Skip to content

Commit 9d800de

Browse files
committed
refactor: simplify decoding logic
1 parent 380f229 commit 9d800de

File tree

1 file changed

+42
-44
lines changed

1 file changed

+42
-44
lines changed

encoder.js

+42-44
Original file line numberDiff line numberDiff line change
@@ -166,64 +166,56 @@ function encode(text) {
166166

167167
/**
168168
* @param {number} token
169-
* @param {number[]} buffer
170169
* @returns {string | undefined}
171170
*/
172-
function decodeToken(token, buffer) {
171+
function decodeToken(token) {
173172
const decodedToken = decoder[token];
174173
if (typeof decodedToken === "undefined") {
175-
return;
174+
return "";
176175
}
177-
const decodedBytes = splitToken(decodedToken).map((x) => byteDecoder.get(x));
178-
179-
let decodedString = "";
180-
for (const byte of decodedBytes) {
181-
if (typeof byte === "undefined") {
182-
continue;
183-
}
184-
buffer.push(byte);
185-
const utf16str = textDecoder.decode(new Uint8Array(buffer), {
186-
stream: true,
187-
});
176+
const decodedBytes = splitToken(decodedToken).map(
177+
(x) => /** @type {number} */ (byteDecoder.get(x)),
178+
);
179+
return textDecoder.decode(new Uint8Array(decodedBytes), {
180+
stream: true,
181+
});
182+
}
188183

189-
// Check if the last character is a high surrogate
190-
const lastCharCode = utf16str.charCodeAt(utf16str.length - 1);
191-
if (
192-
utf16str.length > 0 &&
193-
lastCharCode >= 0xd8_00 &&
194-
lastCharCode <= 0xdb_ff
195-
) {
196-
// Keep the high surrogate in the buffer and continue with the next token
197-
continue;
198-
} else {
199-
decodedString += utf16str;
200-
// reset buffer
201-
buffer.length = 0;
202-
}
203-
}
204-
if (decodedString.length > 0) {
205-
return decodedString;
206-
}
184+
/**
185+
* @param {string} string
186+
* @returns {boolean}
187+
*/
188+
function endsWithIncompleteUtfPairSurrogate(string) {
189+
if (string.length === 0) return false;
190+
// Check if the last UTF-16 code unit is a high surrogate (0xD800–0xDBFF, i.e. 55296–56319)
191+
const lastCharCode = string.charCodeAt(string.length - 1);
192+
return lastCharCode >= 55296 && lastCharCode <= 56319;
207193
}
208194

209195
/**
210196
* @param {Iterable<number>} tokens
211197
* @returns {Generator<string, void, undefined>}
212198
*/
213199
function* decodeGenerator(tokens) {
214-
/** @type {number[]} */
215-
const buffer = [];
200+
/** @type {string} */
201+
let buffer = "";
216202

217203
for (const token of tokens) {
218-
const result = decodeToken(token, buffer);
219-
if (typeof result !== "undefined") {
220-
yield result;
204+
buffer += decodeToken(token);
205+
206+
if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) {
207+
// Nothing to yield yet, or the buffer ends in a high surrogate: keep it and continue with the next token
208+
continue;
209+
} else {
210+
yield buffer;
211+
// reset buffer
212+
buffer = "";
221213
}
222214
}
223215

224216
// Yield any remaining characters in the buffer
225217
if (buffer.length > 0) {
226-
yield textDecoder.decode(new Uint8Array(buffer));
218+
yield buffer;
227219
}
228220
}
229221

@@ -234,19 +226,25 @@ function* decodeGenerator(tokens) {
234226
* @returns {AsyncGenerator<string, void, undefined>}
235227
*/
236228
async function* decodeAsyncGenerator(tokens) {
237-
/** @type {number[]} */
238-
const buffer = [];
229+
/** @type {string} */
230+
let buffer = "";
239231

240232
for await (const token of tokens) {
241-
const result = decodeToken(token, buffer);
242-
if (typeof result !== "undefined") {
243-
yield result;
233+
buffer += decodeToken(token);
234+
235+
if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) {
236+
// Nothing to yield yet, or the buffer ends in a high surrogate: keep it and continue with the next token
237+
continue;
238+
} else {
239+
yield buffer;
240+
// reset buffer
241+
buffer = "";
244242
}
245243
}
246244

247245
// Yield any remaining characters in the buffer
248246
if (buffer.length > 0) {
249-
yield textDecoder.decode(new Uint8Array(buffer));
247+
yield buffer;
250248
}
251249
}
252250

0 commit comments

Comments
 (0)