
Commit 1110ae8

Merge branch 'abetlen:main' into min_tokens
2 parents: 466c9aa + a420f96

File tree

10 files changed: +504 / -60 lines


CHANGELOG.md

Lines changed: 7 additions & 0 deletions
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.61]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ba5e134e073ec6837078c874aba44a702944a676
+- fix: pass correct type to chat handlers for chat completion logprobs by @abetlen in bb65b4d76411112c6fb0bf759efd746f99ef3c6b
+- feat: Add support for yaml based server configs by @abetlen in 060bfa64d529ade2af9b1f4e207a3937bbc4138f
+- feat: Add typechecking for ctypes structure attributes by @abetlen in 1347e1d050fc5a9a32ffe0bb3e22858da28003bd
+
 ## [0.2.60]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@75cd4c77292034ecec587ecb401366f57338f7c0

examples/batch-processing/server.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+"""llama-cpp-python server from scratch in a single file.
+"""
+
+# import llama_cpp
+
+# path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"
+
+# model_params = llama_cpp.llama_model_default_params()
+# model = llama_cpp.llama_load_model_from_file(path, model_params)
+
+# if model is None:
+#     raise RuntimeError(f"Failed to load model from file: {path}")
+
+
+# ctx_params = llama_cpp.llama_context_default_params()
+# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
+
+# if ctx is None:
+#     raise RuntimeError("Failed to create context")
+
+
+from fastapi import FastAPI
+
+app = FastAPI()
+
+import openai.types.chat as types
+
+@app.post("/v1/chat/completions")
+def create_chat_completions():
+    return {"message": "Hello World"}
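
The new example only wires up a FastAPI stub; the low-level llama_cpp calls above are left commented out. The sketch below is a hedged illustration, not part of the commit, of how the stub could be served and queried locally; the use of uvicorn, the "server:app" module reference, and the host/port are assumptions.

```python
# Hedged usage sketch (not part of the commit): serve the example app with
# uvicorn and hit the stub endpoint. Host and port are arbitrary choices,
# and "server:app" assumes the file above is saved as server.py.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("server:app", host="127.0.0.1", port=8000)

# A POST to http://127.0.0.1:8000/v1/chat/completions then returns the
# placeholder body {"message": "Hello World"} until the commented-out model
# and context setup is filled in.
```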

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.60"
+__version__ = "0.2.61"

llama_cpp/llama.py

Lines changed: 2 additions & 1 deletion
@@ -1688,7 +1688,8 @@ def create_chat_completion(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
-            logprobs=top_logprobs if logprobs else None,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
             stream=stream,
             stop=stop,
             seed=seed,
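
This is the fix referenced in the changelog: create_chat_completion previously collapsed the pair into a single value before calling the chat handler, and now forwards both logprobs and top_logprobs so the handler performs the conversion itself. A hedged sketch of how that surfaces in the high-level API; the model path and parameter values are placeholders, not part of the commit.

```python
# Minimal sketch, assuming a local GGUF model; the path and sampling values
# are placeholders.
from llama_cpp import Llama

llm = Llama(model_path="./models/qwen1_5-0_5b-chat-q8_0.gguf", n_ctx=2048)

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello."}],
    logprobs=True,    # OpenAI-style boolean flag
    top_logprobs=3,   # number of alternatives to return per token
)

# With the corrected forwarding, the handler receives both fields, so the
# choice-level logprobs are populated instead of being dropped.
print(response["choices"][0]["logprobs"])
```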

llama_cpp/llama_chat_format.py

Lines changed: 16 additions & 8 deletions
@@ -78,6 +78,8 @@ def __call__(
         mirostat_eta: float = 0.1,
         logits_processor: Optional[llama.LogitsProcessorList] = None,
         grammar: Optional[llama.LlamaGrammar] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
         **kwargs, # type: ignore
     ) -> Union[
         llama_types.CreateChatCompletionResponse,
@@ -339,7 +341,7 @@ def _convert_completion_to_chat_function(
                         }
                     ],
                 },
-                "logprobs": None,
+                "logprobs": completion["choices"][0]["logprobs"],
                 "finish_reason": "tool_calls",
             }
         ],
@@ -392,7 +394,7 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": None,
+                    "logprobs": chunk["choices"][0]["logprobs"],
                     "delta": {
                         "role": None,
                         "content": None,
@@ -427,7 +429,7 @@ def _stream_response_to_function_stream(
                 {
                     "index": 0,
                     "finish_reason": None,
-                    "logprobs": None,
+                    "logprobs": chunk["choices"][0]["logprobs"],
                     "delta": {
                         "role": None,
                         "content": None,
@@ -492,7 +494,6 @@ def chat_completion_handler(
         temperature: float = 0.2,
         top_p: float = 0.95,
         top_k: int = 40,
-        logprobs: int = 0,
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
@@ -514,6 +515,8 @@ def chat_completion_handler(
         logits_processor: Optional[llama.LogitsProcessorList] = None,
         grammar: Optional[llama.LlamaGrammar] = None,
         logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
         **kwargs, # type: ignore
     ) -> Union[
         llama_types.CreateChatCompletionResponse,
@@ -583,7 +586,7 @@ def chat_completion_handler(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
-            logprobs=logprobs,
+            logprobs=top_logprobs if logprobs else None,
             stream=stream,
             stop=stop,
             seed=seed,
@@ -1634,7 +1637,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
                         }
                     ],
                 },
-                "logprobs": None,
+                "logprobs": completion["choices"][0]["logprobs"],
                 "finish_reason": "tool_calls",
             }
         ],
@@ -2094,7 +2097,7 @@ def create_completion(stop):
         choices=[
             {
                 "index": 0,
-                "logprobs": None,
+                "logprobs": completion["choices"][0]["logprobs"],
                 "message": {
                     "role": "assistant",
                     "content": None if content == "" else content,
@@ -2323,11 +2326,14 @@ def chatml_function_calling(
     model: Optional[str] = None,
     logits_processor: Optional[llama.LogitsProcessorList] = None,
     grammar: Optional[llama.LlamaGrammar] = None,
+    logprobs: Optional[bool] = None,
+    top_logprobs: Optional[int] = None,
     **kwargs, # type: ignore
 ) -> Union[
     llama_types.CreateChatCompletionResponse,
     Iterator[llama_types.CreateChatCompletionStreamResponse],
 ]:
+    print(logprobs)
     function_calling_template = (
         "{% for message in messages %}"
         "<|im_start|>{{ message.role }}\n"
@@ -2450,6 +2456,7 @@ def chatml_function_calling(
                 model=model,
                 logits_processor=logits_processor,
                 grammar=grammar,
+                logprobs=top_logprobs if logprobs else None,
             ),
             stream=stream,
         )
@@ -2564,6 +2571,7 @@ def chatml_function_calling(
             typical_p=typical_p,
             stream=stream,
             stop=["<|im_end|>"],
+            logprobs=top_logprobs if logprobs else None,
             max_tokens=None,
             min_tokens=min_tokens,
             presence_penalty=presence_penalty,
@@ -2678,7 +2686,7 @@ def chatml_function_calling(
                 {
                     "finish_reason": "tool_calls",
                     "index": 0,
-                    "logprobs": None,
+                    "logprobs": completion["choices"][0]["logprobs"],
                     "message": {
                         "role": "assistant",
                         "content": None,
