@@ -166,64 +166,56 @@ function encode(text) {
166
166
167
167
/**
 * Decodes a single token into the text its bytes represent.
 *
 * The token is looked up in `decoder`, split into pieces with `splitToken`,
 * and each piece is mapped to a byte through `byteDecoder`. The bytes are then
 * decoded by the shared `textDecoder` in streaming mode, so an incomplete
 * multi-byte sequence at the end of this token is retained by the decoder
 * and completed by the bytes of a later call.
 *
 * @param {number} token
 * @returns {string} The decoded text; `""` when the token has no entry in
 *   `decoder` or when its bytes do not complete a character yet.
 */
function decodeToken(token) {
  const decodedToken = decoder[token];
  if (typeof decodedToken === "undefined") {
    return "";
  }

  /** @type {number[]} */
  const decodedBytes = [];
  for (const piece of splitToken(decodedToken)) {
    const byte = byteDecoder.get(piece);
    // Skip pieces without a byte mapping: storing `undefined` in a
    // Uint8Array would silently coerce it to a 0x00 byte.
    if (typeof byte !== "undefined") {
      decodedBytes.push(byte);
    }
  }

  return textDecoder.decode(new Uint8Array(decodedBytes), { stream: true });
}
188
183
189
- // Check if the last character is a high surrogate
190
- const lastCharCode = utf16str . charCodeAt ( utf16str . length - 1 ) ;
191
- if (
192
- utf16str . length > 0 &&
193
- lastCharCode >= 0xd8_00 &&
194
- lastCharCode <= 0xdb_ff
195
- ) {
196
- // Keep the high surrogate in the buffer and continue with the next token
197
- continue ;
198
- } else {
199
- decodedString += utf16str ;
200
- // reset buffer
201
- buffer . length = 0 ;
202
- }
203
- }
204
- if ( decodedString . length > 0 ) {
205
- return decodedString ;
206
- }
184
/**
 * Reports whether the last UTF-16 code unit of a string is a high (leading)
 * surrogate, i.e. the first half of a surrogate pair whose second half is
 * not present in the string.
 *
 * @param {string} string
 * @returns {boolean} `true` when the final code unit lies in
 *   0xD800-0xDBFF; `false` otherwise, including for the empty string.
 */
function endsWithIncompleteUtfPairSurrogate(string) {
  if (string.length === 0) {
    return false;
  }

  // Bounds of the UTF-16 high-surrogate range.
  const HIGH_SURROGATE_MIN = 0xd800;
  const HIGH_SURROGATE_MAX = 0xdbff;

  const lastCharCode = string.charCodeAt(string.length - 1);
  return (
    lastCharCode >= HIGH_SURROGATE_MIN && lastCharCode <= HIGH_SURROGATE_MAX
  );
}
208
194
209
195
/**
 * Lazily decodes a sequence of tokens into chunks of text.
 *
 * Each token is decoded with `decodeToken` and appended to a pending string.
 * The pending string is yielded as soon as it is non-empty and does not end
 * in a lone high surrogate; otherwise it is held back and extended by the
 * next token.
 *
 * @param {Iterable<number>} tokens
 * @returns {Generator<string, void, undefined>}
 */
function* decodeGenerator(tokens) {
  let buffer = "";

  try {
    for (const token of tokens) {
      buffer += decodeToken(token);

      // Nothing to emit yet, or the text ends in the first half of a
      // surrogate pair: keep it and continue with the next token.
      if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) {
        continue;
      }

      yield buffer;
      buffer = "";
    }

    // decodeToken decodes through the shared textDecoder in streaming mode,
    // so bytes of an unfinished multi-byte sequence may still be pending in
    // it. A non-streaming decode() flushes them instead of dropping them.
    buffer += textDecoder.decode();

    // Yield any remaining characters in the buffer
    if (buffer.length > 0) {
      yield buffer;
    }
  } finally {
    // If the consumer stopped early, discard pending bytes so they do not
    // leak into the next decode. After a normal flush this is a no-op.
    textDecoder.decode();
  }
}
229
221
@@ -234,19 +226,25 @@ function* decodeGenerator(tokens) {
234
226
* @returns {AsyncGenerator<string, void, undefined> }
235
227
*/
236
228
async function* decodeAsyncGenerator(tokens) {
  // Text decoded so far that has not been yielded yet.
  let buffer = "";

  try {
    for await (const token of tokens) {
      buffer += decodeToken(token);

      // Nothing to emit yet, or the text ends in the first half of a
      // surrogate pair: keep it and continue with the next token.
      if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) {
        continue;
      }

      yield buffer;
      buffer = "";
    }

    // decodeToken decodes through the shared textDecoder in streaming mode,
    // so bytes of an unfinished multi-byte sequence may still be pending in
    // it. A non-streaming decode() flushes them instead of dropping them.
    buffer += textDecoder.decode();

    // Yield any remaining characters in the buffer
    if (buffer.length > 0) {
      yield buffer;
    }
  } finally {
    // If the consumer stopped early, discard pending bytes so they do not
    // leak into the next decode. After a normal flush this is a no-op.
    textDecoder.decode();
  }
}
252
250
0 commit comments