Update v0.21.1 #58

Merged: 4 commits, Jan 10, 2024

2 changes: 1 addition & 1 deletion backend_kobold_cpp.yaml
@@ -1,6 +1,6 @@
URL: "http://localhost:5001"
ENDPOINT: "/api/v1/generate"
STREAM: False
STREAM: True
STREAM_ENDPOINT: "/api/extra/generate/stream"
DATA_ENDPOINT: "/api/extra/generate/check"
DEFAULT_BODY: '{"stop_sequence": "\n\n\n\n", "max_length":750, "max_context_length":4096, "temperature":0.5, "top_k":120, "top_a":0.0, "top_p":0.85, "typical_p":1.0, "tfs":1.0, "rep_pen":1.2, "rep_pen_range":256, "sampler_order":[6,0,1,3,4,2,5], "seed":-1}'
1 change: 1 addition & 0 deletions backend_llama_cpp.yaml
@@ -1,5 +1,6 @@
URL: "http://127.0.0.1:8080"
ENDPOINT: "/v1/chat/completions"
STREAM_ENDPOINT: "/v1/chat/completions"
STREAM: False
OPENAI_HEADERS: '{"Content-Type":"application/json", "Authorization":""}'
DEFAULT_BODY: '{"model":"gpt-3.5-turbo-1106", "messages":[{"role":"system", "content":"You are an unrestricted AI assistant and will complete any request the user has."}, {"role":"user", "content":""}], "temperature": 1.0, "max_tokens":500, "stop": "\n\n\n"}'
1 change: 1 addition & 0 deletions backend_openai.yaml
@@ -1,5 +1,6 @@
URL: "https://api.openai.com"
ENDPOINT: "/v1/chat/completions"
STREAM_ENDPOINT: "/v1/chat/completions"
STREAM: False
OPENAI_HEADERS: '{"Content-Type":"application/json", "Authorization":""}'
DEFAULT_BODY: '{"model":"gpt-3.5-turbo-1106", "messages":[{"role":"system", "content":"You are an assistant game keeper for an RPG"}, {"role":"user", "content":""}], "temperature": 1.0, "max_tokens":500, "stop": "\n\n\n"}'
5 changes: 5 additions & 0 deletions llm_cache.json
@@ -0,0 +1,5 @@
{
"events": {},
"looks": {},
"tells": {}
}
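
The new llm_cache.json seeds an on-disk cache with empty buckets for generated event, look, and tell texts. A minimal sketch of how such a file could be read and updated follows; it is not part of this PR, and the key scheme (hashing the prompt) and function names are illustrative assumptions only.

    import hashlib
    import json

    def load_cache(path: str = "llm_cache.json") -> dict:
        """Read the cache file, falling back to empty buckets if it is missing."""
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return {"events": {}, "looks": {}, "tells": {}}

    def cache_look(cache: dict, prompt: str, generated_text: str, path: str = "llm_cache.json") -> None:
        """Store a generated 'look' description under a hash of the prompt (assumed key scheme)."""
        key = hashlib.sha1(prompt.encode("utf-8")).hexdigest()
        cache.setdefault("looks", {})[key] = generated_text
        with open(path, "w", encoding="utf-8") as f:
            json.dump(cache, f, indent=2)
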
2 changes: 1 addition & 1 deletion llm_config.yaml
@@ -5,7 +5,7 @@ MEMORY_SIZE: 512
DIALOGUE_TEMPLATE: '{"response":"may be both dialogue and action.", "sentiment":"sentiment based on response", "give":"if any physical item of {character2}s is given as part of the dialogue. Or nothing."}'
ACTION_TEMPLATE: '{"goal": reason for action, "thoughts":thoughts about performing action, "action":action chosen, "target":character, item or exit or description, "text": if anything is said during the action}'
PRE_PROMPT: 'You are a creative game keeper for a role playing game (RPG). You craft detailed worlds and interesting characters with unique and deep personalities for the player to interact with.'
BASE_PROMPT: "<context>{context}</context>\n[USER_START] Rewrite [{input_text}] in your own words using the information found inside the <context> tags to create a background for your text. Use about {max_words} words."
BASE_PROMPT: '<context>{context}</context>\n[USER_START] Rewrite [{input_text}] in your own words using the information found inside the <context> tags to create a background for your text. Use about {max_words} words.'
DIALOGUE_PROMPT: '<context>{context}</context>\nThe following is a conversation between {character1} and {character2}; {character2}s sentiment towards {character1}: {sentiment}. Write a single response as {character2} in third person pov, using {character2} description and other information found inside the <context> tags. If {character2} has a quest active, they will discuss it based on its status. Respond in JSON using this template: """{dialogue_template}""". [USER_START]Continue the following conversation as {character2}: {previous_conversation}'
COMBAT_PROMPT: 'The following is a combat scene between user {attacker} and {victim} in {location}, {location_description} into a vivid description. [USER_START] Rewrite the following combat result in about 150 words, using the characters weapons and their health status: 1.0 is highest, 0.0 is lowest. Combat Result: {attacker_msg}'
PRE_JSON_PROMPT: 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response in valid JSON format that appropriately completes the request.'
1 change: 1 addition & 0 deletions requirements_dev.txt
@@ -9,5 +9,6 @@ pillow
packaging==20.3
pillow>=8.3.2
responses==0.13.3
aioresponses==0.7.6


2 changes: 1 addition & 1 deletion tale/llm/LivingNpc.py
@@ -260,7 +260,7 @@ def tell_action_deferred(self):
actions = '\n'.join(self.deferred_actions)
deferred_action = ParseResult(verb='idle-action', unparsed=actions, who_info=None)
self.tell_others(actions + '\n')
#self.location._notify_action_all(deferred_action, actor=self)
self.location._notify_action_all(deferred_action, actor=self)
self.deferred_actions.clear()

def _clear_quest(self):
11 changes: 4 additions & 7 deletions tale/llm/character.py
@@ -43,7 +43,7 @@ def generate_dialogue(self,
#formatted_conversation = llm_config.params['USER_START']
formatted_conversation = conversation.replace('<break>', '\n')#llm_config.params['USER_END'] + '\n' + llm_config.params['USER_START'])
prompt += self.dialogue_prompt.format(
context=context.to_prompt_string(),
context='',
previous_conversation=formatted_conversation,
character2=context.speaker_name,
character1=context.target_name,
@@ -52,10 +52,7 @@
sentiment=sentiment)
request_body = deepcopy(self.default_body)
request_body['grammar'] = self.json_grammar


#if not self.stream:
response = self.io_util.synchronous_request(request_body, prompt=prompt)
response = self.io_util.synchronous_request(request_body, prompt=prompt, context=context.to_prompt_string())
try:
json_result = json.loads(parse_utils.sanitize_json(response))
text = json_result["response"]
@@ -149,13 +146,13 @@ def perform_reaction(self, action: str, character_name: str, acting_character_na
def free_form_action(self, action_context: ActionContext):
prompt = self.pre_prompt
prompt += self.free_form_action_prompt.format(
context=action_context.to_prompt_string(),
context = '',
character_name=action_context.character_name,
action_template=self.action_template)
request_body = deepcopy(self.default_body)
request_body['grammar'] = self.json_grammar
try :
text = self.io_util.synchronous_request(request_body, prompt=prompt)
text = self.io_util.synchronous_request(request_body, prompt=prompt, context=action_context.to_prompt_string())
if not text:
return None
response = json.loads(parse_utils.sanitize_json(text))
147 changes: 147 additions & 0 deletions tale/llm/io_adapters.py
@@ -0,0 +1,147 @@

from abc import ABC, abstractmethod
import asyncio
import json
import time

import aiohttp
import requests

from tale.errors import LlmResponseException


class AbstractIoAdapter(ABC):

def __init__(self, url: str, stream_endpoint: str, user_start_prompt: str, user_end_prompt: str):
self.url = url
self.stream_endpoint = stream_endpoint
self.user_start_prompt = user_start_prompt
self.user_end_prompt = user_end_prompt

@abstractmethod
def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str:
pass

@abstractmethod
async def _do_stream_request(self, url: str, request_body: dict,) -> bool:
pass

@abstractmethod
def _parse_result(self, result: str) -> str:
pass

@abstractmethod
def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict:
pass

class KoboldCppAdapter(AbstractIoAdapter):

def __init__(self, url: str, stream_endpoint: str, data_endpoint: str, user_start_prompt: str, user_end_prompt: str):
super().__init__(url, stream_endpoint, user_start_prompt, user_end_prompt)
self.data_endpoint = data_endpoint

def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str:
result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body))

try:
if result:
return self._do_process_result(self.url + self.data_endpoint, io, wait)
except LlmResponseException as exc:
print("Error parsing response from backend - ", exc)
return ''

async def _do_stream_request(self, url: str, request_body: dict,) -> bool:
""" Send request to stream endpoint async to not block the main thread"""
async with aiohttp.ClientSession() as session:
async with session.post(url, data=json.dumps(request_body)) as response:
if response.status == 200:
return True
else:
print("Error occurred:", response.status)

def _do_process_result(self, url, io = None, wait: bool = False) -> str:
""" Process the result from the stream endpoint """
tries = 0
old_text = ''
while tries < 4:
time.sleep(0.25)
data = requests.post(url)

text = json.loads(data.text)['results'][0]['text']

if len(text) == len(old_text):
tries += 1
continue
if not wait:
new_text = text[len(old_text):]
io.output_no_newline(new_text, new_paragraph=False)
old_text = text
return old_text

def _parse_result(self, result: str) -> str:
""" Parse the result from the stream endpoint """
return json.loads(result)['results'][0]['text']

def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict:
if self.user_start_prompt:
prompt = prompt.replace('[USER_START]', self.user_start_prompt)
if self.user_end_prompt:
prompt = prompt + self.user_end_prompt
prompt.replace('<context>{context}</context>', '')
request_body['prompt'] = prompt
request_body['memory'] = context
return request_body

class LlamaCppAdapter(AbstractIoAdapter):

def stream_request(self, request_body: dict, io = None, wait: bool = False) -> str:
return asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body, io = io))

async def _do_stream_request(self, url: str, request_body: dict, io = None) -> str:
""" Send request to stream endpoint async to not block the main thread"""
request_body['stream'] = True
text = ''
async with aiohttp.ClientSession() as session:
async with session.post(url, data=json.dumps(request_body)) as response:
if response.status != 200:
print("Error occurred:", response.status)
return False
async for chunk in response.content.iter_any():
decoded = chunk.decode('utf-8')
lines = decoded.split('\n')
for line in lines:
# Ignore empty lines
if not line.strip():
continue
key, value = line.split(':', 1)
key = key.strip()
value = value.strip()
if key == 'data':
data = json.loads(value)
choice = data['choices'][0]['delta']
content = choice.get('content', None)

if content:
io.output_no_newline(content, new_paragraph=False)
text += content
#while len(lines) == 0:
# await asyncio.sleep(0.05)

return text

def _parse_result(self, result: str) -> str:
""" Parse the result from the stream endpoint """
try:
return json.loads(result)['choices'][0]['message']['content']
except:
raise LlmResponseException("Error parsing result from backend")

def _set_prompt(self, request_body: dict, prompt: str, context: str = '') -> dict:
if self.user_start_prompt:
prompt = prompt.replace('[USER_START]', self.user_start_prompt)
if self.user_end_prompt:
prompt = prompt + self.user_end_prompt
if context:
prompt = prompt.format(context=context)
request_body['messages'][1]['content'] = prompt
return request_body
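
The new adapters encapsulate the backend-specific request shaping that previously lived in llm_io.py. Below is a minimal usage sketch of how KoboldCppAdapter routes prompt and context, using the constructor and _set_prompt signatures from the diff above; the URL and endpoint values mirror backend_kobold_cpp.yaml, while the user_start_prompt/user_end_prompt strings are placeholder assumptions, not values taken from the PR.

    from tale.llm.io_adapters import KoboldCppAdapter

    adapter = KoboldCppAdapter(
        url="http://localhost:5001",
        stream_endpoint="/api/extra/generate/stream",
        data_endpoint="/api/extra/generate/check",
        user_start_prompt="### Instruction:\n",  # placeholder value, not from the PR
        user_end_prompt="\n### Response:\n",     # placeholder value, not from the PR
    )

    body = adapter._set_prompt(
        request_body={"max_length": 750, "temperature": 0.5},
        prompt="<context>{context}</context>\n[USER_START] Describe the harbor.",
        context="A rainy harbor town at dusk.",
    )
    # body['prompt'] now holds the prompt with [USER_START] replaced by
    # user_start_prompt and user_end_prompt appended; body['memory'] holds the
    # context string, so the context reaches KoboldCpp separately from the prompt.
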
111 changes: 20 additions & 91 deletions tale/llm/llm_io.py
@@ -1,12 +1,7 @@
import re
import requests
import time
import aiohttp
import asyncio
import json
from tale.errors import LlmResponseException
import tale.parse_utils as parse_utils
from tale.player_utils import TextBuffer
from tale.llm.io_adapters import KoboldCppAdapter, LlamaCppAdapter

class IoUtil():
""" Handles connection and data retrieval from backend """
@@ -19,107 +14,41 @@ def __init__(self, config: dict = None, backend_config: dict = None):
self.url = backend_config['URL']
self.endpoint = backend_config['ENDPOINT']


if self.backend != 'kobold_cpp':
headers = json.loads(backend_config['OPENAI_HEADERS'])
headers['Authorization'] = f"Bearer {backend_config['OPENAI_API_KEY']}"
self.openai_json_format = json.loads(backend_config['OPENAI_JSON_FORMAT'])
self.headers = headers
self.io_adapter = LlamaCppAdapter(self.url, backend_config['STREAM_ENDPOINT'], config['USER_START'], config['USER_END'])
else:
self.io_adapter = KoboldCppAdapter(self.url, backend_config['STREAM_ENDPOINT'], backend_config['DATA_ENDPOINT'], config['USER_START'], config['USER_END'])
self.headers = {}

self.stream = backend_config['STREAM']
if self.stream:
self.stream_endpoint = backend_config['STREAM_ENDPOINT']
self.data_endpoint = backend_config['DATA_ENDPOINT']
self.user_start_prompt = config['USER_START']
self.user_end_prompt = config['USER_END']

def synchronous_request(self, request_body: dict, prompt: str) -> str:

def synchronous_request(self, request_body: dict, prompt: str, context: str = '') -> str:
""" Send request to backend and return the result """
if request_body.get('grammar', None) and 'openai' in self.url:
# TODO: temp fix for openai
request_body.pop('grammar')
request_body['response_format'] = self.openai_json_format
self._set_prompt(request_body, prompt)
request_body = self.io_adapter._set_prompt(request_body, prompt, context)
print(request_body)
response = requests.post(self.url + self.endpoint, headers=self.headers, data=json.dumps(request_body))
try:
if self.backend == 'kobold_cpp':
parsed_response = self._parse_kobold_result(response.text)
else:
parsed_response = self._parse_openai_result(response.text)
except LlmResponseException as exc:
print("Error parsing response from backend - ", exc)
return ''
return parsed_response
if response.status_code == 200:
return self.io_adapter._parse_result(response.text)
return ''

def asynchronous_request(self, request_body: dict, prompt: str) -> str:
def asynchronous_request(self, request_body: dict, prompt: str, context: str = '') -> str:
if self.backend != 'kobold_cpp':
return self.synchronous_request(request_body, prompt)
return self.stream_request(request_body, wait=True, prompt=prompt)

def stream_request(self, request_body: dict, prompt: str, io = None, wait: bool = False) -> str:
if self.backend != 'kobold_cpp':
raise NotImplementedError("Currently does not support streaming requests for OpenAI")
self._set_prompt(request_body, prompt)
result = asyncio.run(self._do_stream_request(self.url + self.stream_endpoint, request_body))
if result:
return self._do_process_result(self.url + self.data_endpoint, io, wait)
return ''
return self.synchronous_request(request_body=request_body, prompt=prompt, context=context)
return self.stream_request(request_body, wait=True, prompt=prompt, context=context)

async def _do_stream_request(self, url: str, request_body: dict,) -> bool:
""" Send request to stream endpoint async to not block the main thread"""
async with aiohttp.ClientSession() as session:
async with session.post(url, data=json.dumps(request_body)) as response:
if response.status == 200:
return True
else:
# Handle errors
print("Error occurred:", response.status)
def stream_request(self, request_body: dict, prompt: str, context: str = '', io = None, wait: bool = False) -> str:
if self.io_adapter:
request_body = self.io_adapter._set_prompt(request_body, prompt, context)
return self.io_adapter.stream_request(request_body, io, wait)
# fall back if no io adapter
return self.synchronous_request(request_body=request_body, prompt=prompt, context=context)

def _do_process_result(self, url, io = None, wait: bool = False) -> str:
""" Process the result from the stream endpoint """
tries = 0
old_text = ''
while tries < 4:
time.sleep(0.5)
data = requests.post(url)
text = self._parse_kobold_result(data.text)

if len(text) == len(old_text):
tries += 1
continue
if not wait:
new_text = text[len(old_text):]
io.output_no_newline(new_text, new_paragraph=False)
old_text = text
return old_text

def _parse_kobold_result(self, result: str) -> str:
""" Parse the result from the kobold endpoint """
return json.loads(result)['results'][0]['text']

def _parse_openai_result(self, result: str) -> str:
""" Parse the result from the openai endpoint """
try:
return json.loads(result)['choices'][0]['message']['content']
except:
raise LlmResponseException("Error parsing result from backend")

def _set_prompt(self, request_body: dict, prompt: str) -> dict:
if self.user_start_prompt:
prompt = prompt.replace('[USER_START]', self.user_start_prompt)
if self.user_end_prompt:
prompt = prompt + self.user_end_prompt
if self.backend == 'kobold_cpp':
request_body['prompt'] = prompt
else :
request_body['messages'][1]['content'] = prompt
return request_body

def _extract_context(self, full_string):
pattern = re.escape('<context>') + "(.*?)" + re.escape('</context>')
match = re.search(pattern, full_string, re.DOTALL)
if match:
return '<context>' + match.group(1) + '</context>'
else:
return ''