auto-browse
diff --git a/‎package-lock.json
Lines changed: 15 additions & 2 deletions b/‎package-lock.json
Lines changed: 15 additions & 2 deletions
diff --git a/‎package.json
Lines changed: 4 additions & 3 deletions b/‎package.json
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/auto.ts
Lines changed: 114 additions & 81 deletions b/‎src/auto.ts
Lines changed: 114 additions & 81 deletions
@@ -1,6 +1,6 @@
 {
   "name": "@auto-browse/auto-browse",
-  "version": "0.1.4",
+  "version": "0.1.5",
   "description": "AI-powered browser automation",
   "author": "auto-browse",
   "homepage": "https://www.auto-browse.com/",
@@ -21,10 +21,10 @@
     "format": "prettier --write \"src/**/*.ts\"",
     "lint": "eslint \"src/**/*.ts\"",
     "postinstall": "playwright install --with-deps",
-    "clean": "rm -rf dist playwright-report test-results",
+    "clean": "rm -rf dist",
     "check": "npm run format && npm run lint",
     "prepublishOnly": "npm run clean && npm run build && npm run format && npm run lint",
-    "publish-package": "npm run clean && npm run build && npm run format && npm run lint && npm publish --access public --provenance"
+    "publish-package": "npm run clean && npm run build && npm run format && npm run lint && npm publish --access public"
   },
   "keywords": [
     "playwright",
@@ -55,6 +55,7 @@
     "@playwright/test": "1.52.0-alpha-1743011787000",
     "dotenv": "^16.4.7",
     "playwright": "1.52.0-alpha-1743011787000",
+    "yaml": "^2.7.1",
     "zod": "^3.24.2"
   },
   "devDependencies": {
 
@@ -1,41 +1,52 @@
 import { test as base } from '@playwright/test';
+import { z } from 'zod';
 import { AutoConfig } from './types';
 import { sessionManager, context } from './browser';
 import { createReactAgent } from '@langchain/langgraph/prebuilt';
 import { HumanMessage } from '@langchain/core/messages';
 import { createLLMModel } from './llm';
 import {
-    browser_click,
-    browser_type,
-    browser_get_text,
-    browser_navigate,
-    browser_snapshot,
-    browser_hover,
-    browser_drag,
-    browser_select_option,
-    browser_take_screenshot,
-    browser_go_back,
-    browser_wait,
-    browser_press_key,
-    browser_save_pdf,
-    browser_choose_file,
-    browser_go_forward,
-    browser_assert,
+  browser_click,
+  browser_type,
+  browser_get_text,
+  browser_navigate,
+  browser_snapshot,
+  browser_hover,
+  browser_drag,
+  browser_select_option,
+  browser_take_screenshot,
+  browser_go_back,
+  browser_wait,
+  browser_press_key,
+  browser_save_pdf,
+  browser_choose_file,
+  browser_go_forward,
+  browser_assert,
+  browser_page_assert
 } from './tools';
 
+// Define response schema
+const AutoResponseSchema = z.object({
+  action: z
+    .string()
+    .describe('The type of action performed (assert, click, type, etc)'),
+  error: z.string().describe('Error message if any, empty string if none'),
+  output: z.string().describe('Raw output from the action')
+});
+
 // Extend base test to automatically track page
 export const test = base.extend({
-    page: async ({ page }, use) => {
-        sessionManager.setPage(page);
-        await use(page);
-    },
+  page: async ({ page }, use) => {
+    sessionManager.setPage(page);
+    await use(page);
+  }
 });
 
 // Initialize the LangChain agent with more detailed instructions
 const initializeAgent = () => {
-    const model = createLLMModel();
+  const model = createLLMModel();
 
-    const prompt = `You are a web automation assistant. When given a natural language instruction:
+  const prompt = `You are a web automation assistant. When given a natural language instruction:
         - Always call the snapshot tool first to analyze the page structure and elements, so you can understand the context ad the elements available on the page to perform the requested action
         - For "get" or "get text" instructions, use the getText tool to retrieve content
         - For "click" instructions, use the click tool to interact with elements
@@ -51,79 +62,101 @@ const initializeAgent = () => {
         - For pressing keys, use the pressKey tool
         - For saving PDFs, use the savePDF tool
         - For choosing files, use the chooseFile tool
-        - For verification and assertions, use the assert tool
-        Return the operation result or content as requested.`;
+        - While calling the verification and assertion tools, DO NOT assume or make up any expected values. Use the values as provided in the instruction only.
+        - For verification and assertions like {"isVisible", "hasText", "isEnabled", "isChecked"}, use the browser_assert tool
+        - For page assertions like {page title, current page url} use the browser_page_assert tools
+        Return a stringified JSON object with exactly these fields:
+            {
+                "action": "<type of action performed>",
+                "error": "<error message or empty string>",
+                "output": "<your output message>"
+            }`;
 
-    const agent = createReactAgent({
-        llm: model,
-        tools: [
-            browser_click,
-            browser_type,
-            browser_get_text,
-            browser_navigate,
-            browser_snapshot,
-            browser_hover,
-            browser_drag,
-            browser_select_option,
-            browser_take_screenshot,
-            browser_go_back,
-            browser_wait,
-            browser_press_key,
-            browser_save_pdf,
-            browser_choose_file,
-            browser_assert,
-            browser_go_forward,
-        ],
-        stateModifier: prompt,
-    });
+  const agent = createReactAgent({
+    llm: model,
+    tools: [
+      browser_click,
+      browser_type,
+      browser_get_text,
+      browser_navigate,
+      browser_snapshot,
+      browser_hover,
+      browser_drag,
+      browser_select_option,
+      browser_take_screenshot,
+      browser_go_back,
+      browser_wait,
+      browser_press_key,
+      browser_save_pdf,
+      browser_choose_file,
+      browser_assert,
+      browser_go_forward,
+      browser_page_assert
+    ],
+    stateModifier: prompt,
+    responseFormat: {
+      prompt: `Return a stringified JSON object with exactly these fields:
+            {
+                "action": "<type of action performed>",
+                "error": "<error message or empty string>",
+                "output": "<your output message>"
+            }`,
+      schema: AutoResponseSchema
+    }
+  });
 
-    return { agent };
+  return { agent };
 };
 
 // Main auto function that processes instructions
 export async function auto(
-    instruction: string,
-    config?: AutoConfig,
+  instruction: string,
+  config?: AutoConfig
 ): Promise<any> {
-    console.log(`[Auto] Processing instruction: "${instruction}"`);
+  console.log(`[Auto] Processing instruction: "${instruction}"`);
 
-    if (config?.page) {
-        sessionManager.setPage(config.page);
-        console.log(`[Auto] Page set from config`);
-    } else {
-        try {
-            sessionManager.getPage();
-        } catch {
-            // In standalone mode, create a new page
-            console.log(`[Auto] No existing page, creating new page`);
-            await context.createPage();
-        }
+  if (config?.page) {
+    sessionManager.setPage(config.page);
+    console.log(`[Auto] Page set from config`);
+  } else {
+    try {
+      sessionManager.getPage();
+    } catch {
+      // In standalone mode, create a new page
+      console.log(`[Auto] No existing page, creating new page`);
+      await context.createPage();
     }
+  }
 
-    // Create and invoke the agent
-    console.log(`[Auto] Creating agent for instruction`);
-    const { agent } = initializeAgent();
-    const result = await agent.invoke({
-        messages: [new HumanMessage(instruction)],
-    });
+  // Create and invoke the agent
+  console.log(`[Auto] Creating agent for instruction`);
+  const { agent } = initializeAgent();
+  const response = await agent.invoke({
+    messages: [new HumanMessage(instruction)]
+  });
+  const result = response.structuredResponse;
+  // Process agent result
+  try {
+    console.log(`[Auto] Agent response:`, result);
 
-    console.log('Agent result:', result);
-    // Process agent result
-    const response = result.messages?.[-1]?.content;
-    console.log(`[Auto] Agent response:`, response);
+    // Parse and validate the response
+    const validatedResponse = AutoResponseSchema.parse(result);
 
-    if (typeof response === 'string') {
-        // If it's a success message, return null to match original behavior
-        if (response.startsWith('Successfully')) {
-            console.log(`[Auto] Detected success message, returning null`);
-            return null;
-        }
-        console.log(`[Auto] Returning response string`);
-        return response;
+    console.log(`[Auto] Action: ${validatedResponse.action}`);
+    if (validatedResponse.error) {
+      console.log(`[Auto] Error: ${validatedResponse.error}`);
+      throw {
+        error: validatedResponse.error,
+        output: validatedResponse.output
+      };
     }
 
-    console.log(`[Auto] No string response, returning null`);
-    return null;
+    // Return the output or null if successful with no output
+    return validatedResponse.output || null;
+  } catch (error) {
+    console.log(`[Auto] Error processing response:`, error);
+    throw error;
+  }
 }
 
 // Export everything needed for the package