From c662637b7524dc1a4a662847665096afea78c853 Mon Sep 17 00:00:00 2001 From: Kevin Mingtarja <69668484+kevinmingtarja@users.noreply.github.com> Date: Fri, 15 Mar 2024 10:17:44 -0700 Subject: [PATCH] add TokenSet.numSet/toString and tokenRepr in jsctrl (#76) This PR adds support for `TokenSet.numSet/toString` and `tokenRepr` in `jsctrl`. Example: ```js // samples/hello.js async function main() { await $`Ultimate answer is to the life, universe and everything is ` await gen({ regex: /abc^/ }) } start(main) ``` Output: ``` % ../../aici.sh run --build . samples/hello.js ... [0]: FIXED "Ultimate answer is to the life, universe and everything is " [0]: GEN-OPT {regex: /abc^/} [0]: regex constraint: "abc^" [0]: dfa: 244 bytes [0]: ALLOW: TokenSet: 3/50295; "a", "ab", "abc" [0]: GEN-STEP: "a" [0]: ALLOW: TokenSet: 2/50295; "b", "bc" [0]: GEN-STEP: "b" [0]: ALLOW: TokenSet: 1/50295; "c" [0]: GEN-STEP: "c" [0]: ALLOW: TokenSet: 0/50295; [0]: Constraint doesn't allow any tokens; adding EOS [0]: GEN-STEP: EOS [0]: GEN "abc" [0]: JsCtrl: done [DONE] [Response] abc ``` Closes #64 --- controllers/jsctrl/samples/aici-types.d.ts | 14 +++++++++++++- controllers/jsctrl/src/jsctrl.rs | 15 +++++++++++++++ controllers/jsctrl/ts/aici.ts | 7 +++++++ controllers/jsctrl/ts/native.d.ts | 14 +++++++++++++- 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/controllers/jsctrl/samples/aici-types.d.ts b/controllers/jsctrl/samples/aici-types.d.ts index 1e94e5aa..9b0fe7dd 100644 --- a/controllers/jsctrl/samples/aici-types.d.ts +++ b/controllers/jsctrl/samples/aici-types.d.ts @@ -143,6 +143,11 @@ declare module "_aici" { */ function detokenize(tokens: number[]): Buffer; + /** + * Return debug string representation of a given token index + */ + function tokenRepr(token: number): string; + /** * Return identifier of the current sequence. * Most useful with fork_group parameter in mid_process() callback. @@ -200,13 +205,15 @@ declare module "_aici" { */ constructor(); + toString(): string; + add(t: number): void; delete(t: number): void; has(t: number): boolean; clear(): void; /** - * Number of all tokens (not only in the set). + * Number of all possible tokens (regardless of whether they are in the set or not). */ length: number; @@ -214,6 +221,11 @@ declare module "_aici" { * Include or exclude all tokens from the set. */ setAll(value: boolean): void; + + /** + * Number of tokens in the set. + */ + numSet(): number; } /** diff --git a/controllers/jsctrl/src/jsctrl.rs b/controllers/jsctrl/src/jsctrl.rs index 5e6cf98c..c6894d21 100644 --- a/controllers/jsctrl/src/jsctrl.rs +++ b/controllers/jsctrl/src/jsctrl.rs @@ -125,6 +125,11 @@ impl TokenSet { self.inner.len() } + pub fn toString(&self) -> String { + let trie = &mut GLOBAL_STATE.lock().unwrap().trie; + trie.token_set_dbg(&self.inner) + } + pub fn add(&mut self, tok: u32) { self.inner.allow_token(tok); } @@ -144,6 +149,10 @@ impl TokenSet { pub fn setAll(&mut self, val: bool) { self.inner.set_all(val); } + + pub fn numSet(&self) -> usize { + self.inner.num_set() + } } impl Default for TokenSet { @@ -261,6 +270,12 @@ mod aici_mod { Buffer(bytes) } + #[rquickjs::function] + pub fn tokenRepr(token: TokenId) -> String { + let trie = &mut GLOBAL_STATE.lock().unwrap().trie; + trie.token_dbg(token) + } + #[rquickjs::function] pub fn getVar(name: String) -> Option { let name = name.as_str(); diff --git a/controllers/jsctrl/ts/aici.ts b/controllers/jsctrl/ts/aici.ts index da61e9a1..1f0be610 100644 --- a/controllers/jsctrl/ts/aici.ts +++ b/controllers/jsctrl/ts/aici.ts @@ -340,6 +340,11 @@ export class ConstrainedToken extends NextToken { this._constraint = this.mkConstraint(); } this._constraint.allowTokens(bias); + console.log("ALLOW:", bias.toString()); + if (bias.numSet() === 0) { + console.log("Constraint doesn't allow any tokens; adding EOS") + return MidProcessResult.stop(); + } return MidProcessResult.bias(bias); } @@ -677,6 +682,8 @@ export async function genTokens(options: GenOptions): Promise { const tokens = await next_token.run(); res.push(...tokens); + console.log("GEN-STEP:", tokens.map(t => _aici.tokenRepr(t)).join(", ")); + const text = detokenize(res).decode(); if (stopAt !== undefined && text.includes(stopAt)) { diff --git a/controllers/jsctrl/ts/native.d.ts b/controllers/jsctrl/ts/native.d.ts index 2d0413e5..60c7840a 100644 --- a/controllers/jsctrl/ts/native.d.ts +++ b/controllers/jsctrl/ts/native.d.ts @@ -143,6 +143,11 @@ declare module "_aici" { */ function detokenize(tokens: number[]): Buffer; + /** + * Return debug string representation of a given token index + */ + function tokenRepr(token: number): string; + /** * Return identifier of the current sequence. * Most useful with fork_group parameter in mid_process() callback. @@ -200,13 +205,15 @@ declare module "_aici" { */ constructor(); + toString(): string; + add(t: number): void; delete(t: number): void; has(t: number): boolean; clear(): void; /** - * Number of all tokens (not only in the set). + * Number of all possible tokens (regardless of whether they are in the set or not). */ length: number; @@ -214,6 +221,11 @@ declare module "_aici" { * Include or exclude all tokens from the set. */ setAll(value: boolean): void; + + /** + * Number of tokens in the set. + */ + numSet(): number; } /**