
Commit 44688c5

html agent: add self-reflection to model, logic to parse answers from structured outputs
1 parent be78632 commit 44688c5

File tree

server/agent/html_agent.py
server/model.py

2 files changed: +113 -28 lines changed


server/agent/html_agent.py

+37-25
@@ -6,14 +6,15 @@
 
 from server.partition import get_processor
 from server.partition.html_processor import HTMLProcessingSettings
-
+from server.model import JsonFieldStreamProcessor
 
 SYSTEM_PROMPT = """You are an intelligent browser assistant that helps users analyze and work with content from the currently active browser tab. Your main tasks are:
 
-1. Understand and process content only from the current active tab (HTML, PDF, plain text)
-2. Provide relevant information and answers based on the given context
-3. Help users find specific information within the current page
-4. Generate summaries, explanations, or analyses as requested
+1. Reflect on the information you have and what answers you will give to the question
+2. Understand and process content only from the current active tab (HTML, PDF, plain text)
+3. Provide relevant information and answers based on the given context
+4. Help users find specific information within the current page
+5. Generate summaries, explanations, or analyses as requested
 
 Important rules:
 - Always respond in the same language the user's question is asked in
@@ -29,16 +30,18 @@
 
 CHUNK_PROCESSING_PROMPT = """You are processing a part of a webpage. Your task is to:
 
-1. Extract only relevant information from this chunk that relates to the user's question
-2. Provide a focused, self-contained response about this specific part
-3. Consider previous findings when analyzing new information
-4. Keep the response concise and factual
-5. Format the response so it can be easily combined with other parts
+1. Reflect on the information you have and what answers you will give to the question
+2. Extract only relevant information from this chunk that relates to the user's question
+3. Provide a focused, self-contained response about this specific part
+4. Consider previous findings when analyzing new information
+5. Keep the response concise and factual
+6. Format the response so it can be easily combined with other parts
 
 Remember:
 - This is part of an iterative analysis process
 - Focus on new relevant information in this chunk
 - Avoid repeating information already found in previous parts
+- Always respond in the same language the user's question is asked in
 - Maintain the user's original language in the response
 - If you find information that complements or contradicts previous findings, note this
 
@@ -47,15 +50,15 @@
 
 
 class AnswerGeneratorWithRelevanceScore(BaseModel):
-    reflection: str
+    reflections: str
     answer: str
     answer_relevance_score_to_question: float = Field(
         default=None, description="Relevance to the question (0-1)"
     )
 
 
 class AnswerGenerator(BaseModel):
-    reflection: str
+    reflections: str
     answer: str
 
 

@@ -69,6 +72,7 @@ def __init__(
         self.client = llm_client
 
         self.content_processor = get_processor()
+        self.answer_processor = JsonFieldStreamProcessor(field_name="answer")
 
     def get_relevant_info(
         self, question, dialog_history, context, url, processing_settings
@@ -79,7 +83,7 @@ def get_relevant_info(
         messages += [
             {
                 "role": "user" if conv.role == "user" else "assistant",
-                "content": conv.message,
+                "content": f"{conv.message} Page Url: ```{conv.url}```",
             }
             for conv in dialog_history
         ]
@@ -128,7 +132,8 @@ def get_relevant_info(
             messages_parting += [
                 {
                     "role": "user",
-                    "content": f"{question} \n\n Page Url: ```{url}``` \n\nPart of web page \n\n {doc} \n\nYour response format: {AnswerGeneratorWithRelevanceScore.model_json_schema()}",
+                    "content": f"{question} \n\n Page Url: ```{url}``` \n\nPart of web page \n\n {doc} \n\n"
+                    + f"Your response format: {AnswerGeneratorWithRelevanceScore.model_json_schema()}",
                 },
             ]
 

@@ -147,31 +152,38 @@ def get_relevant_info(
                     "content": f"My question: {question} \n\n {selected_content}. The content has already been submitted part by part here are the answers to my question in parts with reflection: \n\n```{self.content_processor.make_page(documents, relevant_chunks, processing_settings)}```",
                 },
             ]
-            response_from_model = self.client.generate(messages, stream=True)
+            response_from_model = self.client.generate(
+                messages,
+                stream=True,
+                schema=AnswerGenerator.model_json_schema(),
+                stream_processor=self.answer_processor,
+            )
         else:
             print("\n\nSINGLE RUN\n\n")
             print(str(documents))
 
             messages += [
                 {
                     "role": "user",
-                    "content": f"{question} \n\n Page url: ```{url}```\n\n {selected_content} \n\n ```{str(documents)}```",
-                    # Your response format: {AnswerGenerator.model_json_schema()}",
+                    "content": f"Question: {question} \n\n Page url: ```{url}```\n\n {selected_content} \n\n ```{str(documents)}```"
+                    + f"\nYour response format: {AnswerGenerator.model_json_schema()}",
                 },
             ]
             response_from_model = self.client.generate(
                 messages,
-                stream=True,
-                # schema=AnswerGenerator.model_json_schema()
+                stream=True,
+                schema=AnswerGenerator.model_json_schema(),
+                stream_processor=self.answer_processor,
             )
         return response_from_model
 
     def generate_chat_response(self, dialog_history):
         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-        for conv in dialog_history:
-            if conv.role == "user":
-                messages.append({"role": "user", "content": conv.message})
-            else:
-                messages.append({"role": "assistant", "content": conv.message})
-
+        messages += [
+            {
+                "role": "user" if conv.role == "user" else "assistant",
+                "content": f"{conv.message} Page Url: ```{conv.url}```",
+            }
+            for conv in dialog_history
+        ]
         return self.client.generate(messages, stream=True)
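Taken together, both branches of get_relevant_info now constrain generation to the AnswerGenerator schema and stream only the answer field back to the caller. A minimal sketch of the resulting call pattern, assuming a hypothetical LlamaCppWrapper(model_path=...) constructor (its real arguments are not shown in this commit):

    # Sketch only: the LlamaCppWrapper constructor arguments are assumed.
    from server.agent.html_agent import AnswerGenerator
    from server.model import JsonFieldStreamProcessor, LlamaCppWrapper

    client = LlamaCppWrapper(model_path="model.gguf")  # hypothetical arguments
    answer_processor = JsonFieldStreamProcessor(field_name="answer")

    messages = [
        {
            "role": "user",
            "content": "Question: What is this page about?\n"
            + f"Your response format: {AnswerGenerator.model_json_schema()}",
        },
    ]

    # Generation is constrained to AnswerGenerator-shaped JSON; the stream
    # processor then yields only the text inside the "answer" field.
    for piece in client.generate(
        messages,
        stream=True,
        schema=AnswerGenerator.model_json_schema(),
        stream_processor=answer_processor,
    ):
        print(piece, end="", flush=True)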

server/model.py

+76-3
@@ -4,6 +4,71 @@
 from tqdm import tqdm
 
 
+from abc import ABC, abstractmethod
+from typing import Generator, Any, Optional, Callable
+
+
+class StreamProcessor(ABC):
+    @abstractmethod
+    def process_stream(self, stream: Generator) -> Generator:
+        pass
+
+
+class JsonFieldStreamProcessor(StreamProcessor):
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+        self.buffer = ""
+        self.in_field = False
+        self.json_started = False
+
+    def process_stream(self, stream: Generator) -> Generator:
+        for token in stream:
+            content = token["choices"][0]["delta"].get("content", "")
+            if not content:
+                continue
+
+            self.buffer += content
+
+            if not self.json_started and "{" in self.buffer:
+                self.json_started = True
+
+            if not self.json_started:
+                continue
+
+            field_marker = f'"{self.field_name}": "'
+            if field_marker in self.buffer and not self.in_field:
+                self.in_field = True
+                self.buffer = self.buffer.split(field_marker)[1]
+
+            if self.in_field:
+                index = 0
+                while index < len(self.buffer):
+                    if self.buffer[index] == '"':
+                        if index > 0 and self.buffer[index - 1] == '\\':
+                            index += 1
+                            continue
+
+                        field_content = self.buffer[:index]
+                        self.buffer = self.buffer[index + 1:]
+
+                        if field_content:
+                            yield field_content
+
+                        self.in_field = False
+                        break
+                    index += 1
+
+                if self.in_field and self.buffer:
+                    yield self.buffer
+                    self.buffer = ""
+
+
+class DefaultStreamProcessor(StreamProcessor):
+    def process_stream(self, stream: Generator) -> Generator:
+        for token in stream:
+            yield token["choices"][0]["delta"].get("content", "")
+
+
 class LlamaCppWrapper:
     def __init__(
         self,
@@ -50,7 +115,13 @@ def get_params(self):
     def tokenize(self, text):
         return self.model.tokenize(text.encode("utf8"))
 
-    def generate(self, template, stream=False, schema=None):
+    def generate(
+        self,
+        template,
+        stream=False,
+        schema=None,
+        stream_processor=None,
+    ):
         if schema:
             response_generator = self.model.create_chat_completion(
                 template,
@@ -70,8 +141,10 @@ def generate(self, template, stream=False, schema=None):
         if stream:
 
             def generate():
-                for token in response_generator:
-                    yield token["choices"][0]["delta"].get("content", "")
+                processor = (
+                    stream_processor if stream_processor else DefaultStreamProcessor()
+                )
+                return processor.process_stream(response_generator)
 
             return generate()
         return response_generator["choices"][0]["message"]["content"]
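For reference, a self-contained sketch of what JsonFieldStreamProcessor does, run against a hand-built stream shaped like llama-cpp-python's streaming chat deltas (the chunk text is invented for the example):

    # Illustrative only: simulate a streamed structured response and check that
    # JsonFieldStreamProcessor yields just the contents of the "answer" field.
    from server.model import JsonFieldStreamProcessor


    def fake_stream(chunks):
        # Mimic the token dicts produced by a streaming chat completion API.
        for chunk in chunks:
            yield {"choices": [{"delta": {"content": chunk}}]}


    chunks = [
        '{"reflections": "The page names a capital city.", ',
        '"answer": "Paris is ',
        'the capital of France.',
        '"}',
    ]

    processor = JsonFieldStreamProcessor(field_name="answer")
    print("".join(processor.process_stream(fake_stream(chunks))))
    # -> Paris is the capital of France.

Note that the field marker match relies on the model emitting "answer": " with exactly one space after the colon; schema-constrained decoding normally produces that layout, but free-form JSON with different whitespace would slip past the parser.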
