
Commit ee73aec

Merge branch 'main' into patch-1
2 parents 478a770 + 7aaf701

10 files changed: +712 / -986 lines

.github/workflows/build-and-release.yaml

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.2
+        uses: pypa/cibuildwheel@v2.20.0
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.2
+        uses: pypa/cibuildwheel@v2.20.0
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/build-wheels-metal.yaml

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ jobs:
           python -m pip install -e .[all]
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.2
+        uses: pypa/cibuildwheel@v2.20.0
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/generate-index-from-release.yaml

Lines changed: 5 additions & 3 deletions
@@ -1,9 +1,11 @@
 name: Wheels Index
 
 on:
-  # Trigger on any new release
-  release:
-    types: [published]
+  # Trigger on new release
+  workflow_run:
+    workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)"]
+    types:
+      - completed
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.87]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@be55695eff44784a141a863f273661a6bce63dfc
+- fix: Include all llama.cpp source files and subdirectories by @abetlen in 9cad5714ae6e7c250af8d0bbb179f631368c928b
+- feat(ci): Re-build wheel index automatically when releases are created by @abetlen in 198f47dc1bd202fd2b71b29e041a9f33fe40bfad
+
+## [0.2.86]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@398ede5efeb07b9adf9fbda7ea63f630d476a792
+- feat: Ported back new grammar changes from C++ to Python implementation by @ExtReMLapin in (#1637)
+- fix: llama_grammar_accept_token arg order by @tc-wolf in (#1649)
+
 ## [0.2.85]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@398ede5efeb07b9adf9fbda7ea63f630d476a792

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.85"
+__version__ = "0.2.87"

llama_cpp/llama.py

Lines changed: 4 additions & 3 deletions
@@ -777,11 +777,12 @@ def generate(
                 else:
                     break
             if longest_prefix > 0:
-                if self.verbose:
-                    print("Llama.generate: prefix-match hit", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
                 self.n_tokens = longest_prefix
+                if self.verbose:
+                    print(f"Llama.generate: {longest_prefix} prefix-match hit, "
+                          f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr)
 
         # Reset the model state
         if reset:
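
The relocated verbose log now reports how much of the cached prompt was reused and how many tokens remain to be evaluated, since it runs after the prompt has been sliced. As a rough standalone illustration of the prefix-reuse idea behind this hunk (the helper name and token values are invented for illustration; the real logic lives inside Llama.generate):

# Standalone sketch of the prefix-reuse logic shown in the hunk above.
# cached_tokens stands in for tokens already evaluated into the KV cache;
# the helper name and example values are illustrative, not library API.
from typing import List, Tuple

def split_on_cached_prefix(cached_tokens: List[int], prompt: List[int]) -> Tuple[int, List[int]]:
    longest_prefix = 0
    # Compare against prompt[:-1] so at least one token is always left to evaluate.
    for a, b in zip(cached_tokens, prompt[:-1]):
        if a == b:
            longest_prefix += 1
        else:
            break
    return longest_prefix, prompt[longest_prefix:]

cached = [1, 15043, 29892, 3186]
prompt = [1, 15043, 29892, 3186, 29991, 2]
prefix, remaining = split_on_cached_prefix(cached, prompt)
# Prints: Llama.generate: 4 prefix-match hit, remaining 2 prompt tokens to eval
print(f"Llama.generate: {prefix} prefix-match hit, remaining {len(remaining)} prompt tokens to eval")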
@@ -2159,7 +2160,7 @@ def from_pretrained(
 
         files = [
             file["name"] if isinstance(file, dict) else file
-            for file in hffs.ls(repo_id)
+            for file in hffs.ls(repo_id, recursive=True)
         ]
 
         # split each file into repo_id, subfolder, filename
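
Listing with recursive=True also returns files that live in subfolders of the repo, which is what the subsequent split step relies on. A minimal, hypothetical sketch of that split (the helper name and paths are invented for illustration; the actual implementation is in Llama.from_pretrained):

# Illustrative only: splitting a listed path into (repo_id, subfolder, filename),
# mirroring the "split each file into repo_id, subfolder, filename" step above.
def split_repo_path(path: str, repo_id: str) -> tuple:
    relative = path[len(repo_id):].lstrip("/")         # e.g. "quantized/model-q4_k_m.gguf"
    subfolder, _, filename = relative.rpartition("/")  # subfolder is "" for top-level files
    return repo_id, subfolder, filename

print(split_repo_path("org/repo/quantized/model-q4_k_m.gguf", "org/repo"))
# ('org/repo', 'quantized', 'model-q4_k_m.gguf')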

llama_cpp/llama_cpp.py

Lines changed: 3 additions & 3 deletions
@@ -944,7 +944,7 @@ class llama_context_params(ctypes.Structure):
 #     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
 #     enum llama_ftype ftype;              // quantize to this llama_ftype
 #     enum ggml_type output_tensor_type;   // output tensor type
-#     enum ggml_type token_embedding_type; // itoken embeddings tensor type
+#     enum ggml_type token_embedding_type; // token embeddings tensor type
 #     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
 #     bool quantize_output_tensor;         // quantize output.weight
 #     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -960,7 +960,7 @@ class llama_model_quantize_params(ctypes.Structure):
         nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         ftype (int): quantize to this llama_ftype
         output_tensor_type (int): output tensor type
-        token_embedding_type (int): itoken embeddings tensor type
+        token_embedding_type (int): token embeddings tensor type
         allow_requantize (bool): allow quantizing non-f32/f16 tensors
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -3002,7 +3002,7 @@ def llama_grammar_init(
     n_rules: Union[ctypes.c_size_t, int],
     start_rule_index: Union[ctypes.c_size_t, int],
     /,
-) -> llama_grammar_p:
+) -> Optional[llama_grammar_p]:
     """Initialize a grammar from a set of rules."""
     ...
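
With the return annotation changed to Optional, callers should account for the underlying C call returning a NULL pointer. A hedged sketch of such a caller-side check (the wrapper function and error message are invented; rules, n_rules and start_rule_index are assumed to be prepared elsewhere):

# Sketch only: guard against the now-Optional return value of llama_grammar_init.
import llama_cpp

def init_grammar_or_raise(rules, n_rules: int, start_rule_index: int):
    grammar = llama_cpp.llama_grammar_init(rules, n_rules, start_rule_index)
    if grammar is None:
        # NULL pointer from the C side, e.g. an invalid rule set
        raise RuntimeError("llama_grammar_init returned NULL")
    return grammar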
