@@ -1,27 +1,19 @@
 from __future__ import annotations

-import os
 import ctypes
-
-from typing import (
-    Dict,
-    List,
-    Tuple,
-    Optional,
-    Sequence,
-)
-from dataclasses import dataclass, field
+import os
 from contextlib import ExitStack
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Sequence, Tuple

 import numpy as np
 import numpy.typing as npt

-from .llama_types import *
-from .llama_grammar import LlamaGrammar
-from ._utils import suppress_stdout_stderr
-
 import llama_cpp.llama_cpp as llama_cpp

+from ._utils import suppress_stdout_stderr
+from .llama_grammar import LlamaGrammar
+from .llama_types import *

 # Python wrappers over llama.h structs

@@ -631,7 +623,7 @@ def sample(
         if len(self.prev) > 0:
             nl_token = ctx_main.model.token_nl()
             nl_logit = logits_array[nl_token]
-            last_tokens = self.prev[-self.params.penalty_last_n :]
+            last_tokens = self.prev[-self.params.penalty_last_n:]
             last_tokens_size = min(len(last_tokens), self.params.penalty_last_n)
             if last_tokens_size > 0:
                 last_tokens_p = (llama_cpp.llama_token * len(last_tokens))(*last_tokens)
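The only functional content in the hunk above is the slice that builds the repetition-penalty window; dropping the space before the colon is purely cosmetic and does not change the slice. Below is a minimal, self-contained sketch of what that line computes, using hypothetical values and `ctypes.c_int32` as a stand-in for `llama_cpp.llama_token`:

```python
import ctypes

penalty_last_n = 64
prev = list(range(100))  # pretend these are previously sampled token ids

# `prev[-penalty_last_n:]` keeps only the most recent `penalty_last_n` tokens;
# it is identical to `prev[-penalty_last_n :]` from before the change.
last_tokens = prev[-penalty_last_n:]
last_tokens_size = min(len(last_tokens), penalty_last_n)

# Pack the window into a ctypes array, as the surrounding code does before
# handing it to llama.cpp's repetition-penalty sampler.
last_tokens_p = (ctypes.c_int32 * len(last_tokens))(*last_tokens)

assert last_tokens_size == 64
assert list(last_tokens_p) == prev[-64:]
```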
@@ -697,8 +689,9 @@ def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
         self.prev.append(id)


-from typing import List, Callable, Optional, Union
 import ctypes
+from typing import Callable, List, Optional, Union
+
 import llama_cpp

