feat: add decodeAsyncGenerator and support any iterable input

niieani · niieani · commit 380f229e25cd · 2023-04-15T16:58:03.000-07:00
diff --git a/encoder.js b/encoder.js
@@ -165,48 +165,82 @@ function encode(text) {
 }
 
 /**
- * @param {number[]} tokens
+ * @param {number} token - key looked up in the `decoder` table
+ * @param {number[]} buffer - pending bytes shared across calls; pushed to and emptied in place
+ * @returns {string | undefined} decoded text, or undefined for an unknown token or when no text was produced
+ */
+function decodeToken(token, buffer) {
+  const decodedToken = decoder[token];
+  if (typeof decodedToken === "undefined") {
+    return; // unknown token: nothing to decode
+  }
+  const decodedBytes = splitToken(decodedToken).map((x) => byteDecoder.get(x));
+
+  let decodedString = "";
+  for (const byte of decodedBytes) {
+    if (typeof byte === "undefined") {
+      continue; // byteDecoder has no entry for this piece of the token: skip it
+    }
+    buffer.push(byte);
+    const utf16str = textDecoder.decode(new Uint8Array(buffer), {
+      stream: true, // NOTE(review): presumably a standard TextDecoder, which would retain incomplete sequences between calls — confirm
+    });
+
+    // Hold output back while the decoded text ends in a UTF-16 high surrogate (0xD800-0xDBFF)
+    const lastCharCode = utf16str.charCodeAt(utf16str.length - 1);
+    if (
+      utf16str.length > 0 &&
+      lastCharCode >= 0xd8_00 &&
+      lastCharCode <= 0xdb_ff
+    ) {
+      // Leave the bytes in the buffer (no reset) and move on to the next byte of this token
+      continue;
+    } else {
+      decodedString += utf16str;
+      // Text accepted: empty the caller's array in place so the same buffer is reused
+      buffer.length = 0;
+    }
+  }
+  if (decodedString.length > 0) {
+    return decodedString;
+  }
+}
+
+/**
+ * @param {Iterable<number>} tokens - any iterable of tokens; consumed one at a time with for...of
  * @returns {Generator<string, void, undefined>}
  */
 function* decodeGenerator(tokens) {
   /** @type {number[]} */
-  let buffer = [];
+  const buffer = []; // single array for the whole run; decodeToken mutates it in place
 
   for (const token of tokens) {
-    const decodedToken = decoder[token];
-    if (typeof decodedToken === "undefined") {
-      continue;
+    const result = decodeToken(token, buffer);
+    if (typeof result !== "undefined") {
+      yield result; // tokens that produced no text yield nothing
     }
-    const decodedBytes = splitToken(decodedToken).map((x) =>
-      byteDecoder.get(x),
-    );
-
-    let decodedString = "";
-    for (const byte of decodedBytes) {
-      if (typeof byte === "undefined") {
-        continue;
-      }
-      buffer.push(byte);
-      const utf16str = textDecoder.decode(new Uint8Array(buffer), {
-        stream: true,
-      });
-
-      // Check if the last character is a high surrogate
-      const lastCharCode = utf16str.charCodeAt(utf16str.length - 1);
-      if (
-        utf16str.length > 0 &&
-        lastCharCode >= 0xd8_00 &&
-        lastCharCode <= 0xdb_ff
-      ) {
-        // Keep the high surrogate in the buffer and continue with the next token
-        continue;
-      } else {
-        decodedString += utf16str;
-        buffer = [];
-      }
-    }
-    if (decodedString.length > 0) {
-      yield decodedString;
+  }
+
+  // Input exhausted: decode and emit whatever bytes are still pending in the buffer
+  if (buffer.length > 0) {
+    yield textDecoder.decode(new Uint8Array(buffer));
+  }
+}
+
+/**
+ * Decode tokens asynchronously and yield the decoded strings, one by one.
+ * Will not yield for tokens that include a high surrogate, but wait for the next token.
+ * @param {AsyncIterable<number>} tokens
+ * @returns {AsyncGenerator<string, void, undefined>}
+ */
+async function* decodeAsyncGenerator(tokens) {
+  /** @type {number[]} */
+  const buffer = [];
+
+  for await (const token of tokens) {
+    const result = decodeToken(token, buffer);
+    if (typeof result !== "undefined") {
+      yield result;
     }
   }
 
@@ -228,4 +262,5 @@ module.exports.encode = encode;
 module.exports.decode = decode;
 module.exports.encodeGenerator = encodeGenerator;
 module.exports.decodeGenerator = decodeGenerator;
+module.exports.decodeAsyncGenerator = decodeAsyncGenerator;
 module.exports.isWithinTokenLimit = isWithinTokenLimit;
diff --git a/encoder.test.js b/encoder.test.js
@@ -3,6 +3,7 @@ const {
   encode,
   isWithinTokenLimit,
   decodeGenerator,
+  decodeAsyncGenerator,
 } = require("./encoder");
 
 test("empty string", () => {
@@ -42,16 +43,18 @@ test("multi-token word", () => {
   expect(isWithinTokenLimit(str, 3)).toEqual(3);
 });
 
+const helloWorldTokens = [31373, 50169, 233, 995, 12520, 234, 235];
+
 test("emojis", () => {
   const str = "hello 👋 world 🌍";
-  expect(encode(str)).toEqual([31373, 50169, 233, 995, 12520, 234, 235]);
+  expect(encode(str)).toEqual(helloWorldTokens);
   expect(decode(encode(str))).toEqual(str);
   expect(isWithinTokenLimit(str, 4)).toEqual(false);
   expect(isWithinTokenLimit(str, 400)).toEqual(7);
 });
 
 test("decode token-by-token via generator", () => {
-  const generator = decodeGenerator([31373, 50169, 233, 995, 12520, 234, 235]);
+  const generator = decodeGenerator(helloWorldTokens);
   expect(generator.next().value).toEqual("hello");
   expect(generator.next().value).toEqual(" ");
   expect(generator.next().value).toEqual("👋");
@@ -60,6 +63,20 @@ test("decode token-by-token via generator", () => {
   expect(generator.next().value).toEqual("🌍");
 });
 
+async function* getHelloWorldTokensAsync() { // test helper: async source for helloWorldTokens
+  for (const token of helloWorldTokens) {
+    yield await Promise.resolve(token); // each token is delivered through a resolved promise, in array order
+  }
+}
+
+test("decode token-by-token via async generator", async () => {
+  const generator = decodeAsyncGenerator(getHelloWorldTokensAsync());
+  const decoded = ["hello", " ", "👋", " world", " ", "🌍"];
+  for await (const value of generator) {
+    expect(value).toEqual(decoded.shift());
+  }
+});
+
 test("properties of Object", () => {
   const str = "toString constructor hasOwnProperty valueOf";
 
@@ -72,28 +89,10 @@ test("properties of Object", () => {
 test("text with commas", () => {
   const str = "hello, I am a text, and I have commas. a,b,c";
   expect(decode(encode(str))).toEqual(str);
-  expect(encode(str)).toMatchInlineSnapshot(`
-    [
-      31373,
-      11,
-      314,
-      716,
-      257,
-      2420,
-      11,
-      290,
-      314,
-      423,
-      725,
-      292,
-      13,
-      257,
-      11,
-      65,
-      11,
-      66,
-    ]
-  `);
+  expect(encode(str)).toStrictEqual([
+    31373, 11, 314, 716, 257, 2420, 11, 290, 314, 423, 725, 292, 13, 257, 11,
+    65, 11, 66,
+  ]);
   expect(isWithinTokenLimit(str, 15)).toEqual(false);
   expect(isWithinTokenLimit(str, 300)).toEqual(18);
 });
diff --git a/tsconfig.json b/tsconfig.json
@@ -22,6 +22,11 @@
     /* Completeness */
     "skipLibCheck": true /* Skip type checking all .d.ts files. */
   },
-  "include": ["*.js", "data/*"],
-  "exclude": ["jest.config.js"]
+  "include": [
+    "*.js",
+    "data/*.js"
+  ],
+  "exclude": [
+    "jest.config.js"
+  ]
 }