Commit d70f8e2

Merge branch 'main' into functionary-stream
2 parents: 23177a3 + f9b7221

File tree: 7 files changed (+94, −8 lines)

CHANGELOG.md

Lines changed: 9 additions & 1 deletion

@@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.69]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
+- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
+- fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
+- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
+- fix: UTF-8 handling with grammars by @jsoma in #1415
+
 ## [0.2.68]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@
+- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
 - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
 - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
 

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.68"
+__version__ = "0.2.69"

llama_cpp/llama.py

Lines changed: 6 additions & 1 deletion

@@ -262,7 +262,12 @@ def __init__(
                         raise ValueError(f"Value for {k} is too long: {v}")
                     v_bytes = v_bytes.ljust(128, b"\0")
                     self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
-                    self._kv_overrides_array[i].value.str_value[:128] = v_bytes
+                    # copy min(v_bytes, 128) to str_value
+                    ctypes.memmove(
+                        self._kv_overrides_array[i].value.str_value,
+                        v_bytes,
+                        min(len(v_bytes), 128),
+                    )
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
 
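For context on the fix above, a standalone sketch of the same memmove pattern follows. It is not llama-cpp-python code: the KVOverrideValue structure and the example value are made up, and only the copy into a fixed 128-byte ctypes char array mirrors the change.

import ctypes


class KVOverrideValue(ctypes.Structure):
    # Hypothetical stand-in for the struct field that holds the 128-byte string value.
    _fields_ = [("str_value", ctypes.c_char * 128)]


value = KVOverrideValue()
v_bytes = "my-model-name".encode("utf-8").ljust(128, b"\0")  # NUL-padded, as in the diff

# Copy at most 128 bytes directly into the array's buffer,
# rather than slice-assigning into the c_char array.
ctypes.memmove(value.str_value, v_bytes, min(len(v_bytes), 128))

print(value.str_value.value)  # b'my-model-name' (bytes up to the first NUL)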

llama_cpp/llama_chat_format.py

Lines changed: 62 additions & 2 deletions

@@ -2475,7 +2475,7 @@ def generate_streaming(tools, functions, function_call, prompt):
 
 
 class Llava15ChatHandler:
-    DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+    DEFAULT_SYSTEM_MESSAGE: Optional[str] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
 
     CHAT_FORMAT = (
         "{% for message in messages %}"

@@ -2598,7 +2598,7 @@ def __call__(
         assert self.clip_ctx is not None
 
         system_prompt = _get_system_message(messages)
-        if system_prompt == "":
+        if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
             messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages
 
         image_urls = self.get_image_urls(messages)

@@ -3081,6 +3081,66 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
         "{% endif %}"
     )
 
+class Llama3VisionAlpha(Llava15ChatHandler):
+    # question = "<image>" + q
+
+    # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    DEFAULT_SYSTEM_MESSAGE = None
+
+    CHAT_FORMAT = (
+        "{% for message in messages %}"
+
+        "<|start_header_id|>"
+
+        "{% if message.role == 'user' %}"
+
+        "user<|end_header_id|>\n\n"
+
+        "{% if message.content is iterable %}"
+
+        # <image>
+        "{% for content in message.content %}"
+        "{% if content.type == 'image_url' %}"
+        "{% if content.image_url is string %}"
+        "{{ content.image_url }}"
+        "{% endif %}"
+        "{% if content.image_url is mapping %}"
+        "{{ content.image_url.url }}"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        # Question:
+        "{% for content in message.content %}"
+        "{% if content.type == 'text' %}"
+        "{{ content.text }}"
+        "{% endif %}"
+        "{% endfor %}"
+
+        "{% endif %}"
+
+        # Question:
+        "{% if message.content is string %}"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "{% endif %}"
+
+        # Answer:
+        "{% if message.role == 'assistant' %}"
+        "assistant<|end_header_id|>\n\n"
+        "{{ message.content }}"
+        "{% endif %}"
+
+        "<|eot_id|>"
+
+        "{% endfor %}"
+
+        # Generation prompt
+        "{% if add_generation_prompt %}"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+        "{% endif %}"
+    )
 
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
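
A minimal usage sketch for the new handler, assuming a llama-3-vision-alpha GGUF model and its CLIP projector are available locally; the paths and the image URL below are placeholders, and the call pattern simply follows the existing Llava15ChatHandler-based handlers.

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llama3VisionAlpha

chat_handler = Llama3VisionAlpha(
    clip_model_path="path/to/llama-3-vision-alpha-mmproj.gguf",  # placeholder path
)
llm = Llama(
    model_path="path/to/llama-3-vision-alpha.gguf",  # placeholder path
    chat_handler=chat_handler,
    n_ctx=4096,  # extra context so the image embedding tokens fit
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])

Because this handler sets DEFAULT_SYSTEM_MESSAGE = None, the new guard in __call__ skips injecting a default system prompt, matching the Llama 3 prompt layout sketched in the class comments.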

llama_cpp/server/model.py

Lines changed: 15 additions & 0 deletions

@@ -140,6 +140,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
             clip_model_path=settings.clip_model_path, verbose=settings.verbose
         )
+    elif settings.chat_format == "llama-3-vision-alpha":
+        assert settings.clip_model_path is not None, "clip model not found"
+        if settings.hf_model_repo_id is not None:
+            chat_handler = (
+                llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
+                    repo_id=settings.hf_model_repo_id,
+                    filename=settings.clip_model_path,
+                    verbose=settings.verbose,
+                )
+            )
+        else:
+            chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
+                clip_model_path=settings.clip_model_path, verbose=settings.verbose
+            )
     elif settings.chat_format == "hf-autotokenizer":
         assert (
             settings.hf_pretrained_model_name_or_path is not None

@@ -228,6 +242,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         logits_all=settings.logits_all,
         embedding=settings.embedding,
         offload_kqv=settings.offload_kqv,
+        flash_attn=settings.flash_attn,
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params
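
A hedged sketch of driving the patched settings path above: the field names come from the diff, the paths are placeholders, and the ModelSettings import location is assumed to be llama_cpp.server.settings.

from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings  # assumed import path

settings = ModelSettings(
    model="path/to/llama-3-vision-alpha.gguf",  # placeholder path to the GGUF weights
    chat_format="llama-3-vision-alpha",         # routes to the new elif branch above
    clip_model_path="path/to/mmproj.gguf",      # required: the assert checks it is not None
    flash_attn=True,                            # forwarded to Llama(...) via the second hunk
    verbose=False,
)
llm = load_llama_from_model_settings(settings)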

llama_cpp/server/types.py

Lines changed: 0 additions & 2 deletions

@@ -18,8 +18,6 @@
 
 temperature_field = Field(
     default=0.8,
-    ge=0.0,
-    le=2.0,
     description="Adjust the randomness of the generated text.\n\n"
     + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
 )
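
With the ge/le bounds removed, the field no longer rejects temperatures outside [0.0, 2.0]. A standalone pydantic sketch illustrating the effect; the SamplingParams model is hypothetical, not server code.

from pydantic import BaseModel, Field

temperature_field = Field(
    default=0.8,
    description="Adjust the randomness of the generated text.",
)


class SamplingParams(BaseModel):
    # Hypothetical request model reusing the field, mirroring how the server types do it.
    temperature: float = temperature_field


# Previously rejected by the le=2.0 constraint; now accepted.
print(SamplingParams(temperature=5.0).temperature)  # 5.0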

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule pointer update)