
Commit 340e58a

Merge branch 'main' into patch-1
2 parents: cfe98dd + 9992c50

File tree: 12 files changed, +94 −24 lines

.github/workflows/build-and-release.yaml

Lines changed: 5 additions & 3 deletions

@@ -92,7 +92,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
-
+
       - uses: actions/setup-python@v5
         with:
           python-version: "3.9"
@@ -103,6 +103,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install uv
           RUST_LOG=trace python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: bash

       - name: Install dependencies (Windows)
@@ -113,12 +114,13 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: cmd
-
+
       - name: Build source distribution
         run: |
           python -m build --sdist
-
+
       - uses: actions/upload-artifact@v4
         with:
           name: sdist
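The addition above installs the PyPA `build` package with uv so the later `python -m build --sdist` step has it available. A minimal local equivalent of that step, assuming `build` is installed in the current environment:

```python
# Sketch of the sdist step the workflow runs, driven from Python instead of the
# shell step above; writes the source distribution into dist/.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "build", "--sdist"], check=True)
```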

.github/workflows/build-wheels-cuda.yaml

Lines changed: 1 addition & 1 deletion

@@ -72,7 +72,7 @@ jobs:
       - name: VS Integration Cache
         id: vs-integration-cache
         if: runner.os == 'Windows'
-        uses: actions/cache@v4.0.2
+        uses: actions/cache@v4
         with:
           path: ./MSBuildExtensions
           key: cuda-${{ matrix.cuda }}-vs-integration

.github/workflows/publish.yaml

Lines changed: 7 additions & 5 deletions

@@ -13,34 +13,36 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
-
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
           python-version: "3.9"
-
+
       - name: Install dependencies (Linux/MacOS)
         if: runner.os != 'Windows'
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           RUST_LOG=trace python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: bash

       - name: Install dependencies (Windows)
         if: runner.os == 'Windows'
         env:
-          RUST_LOG: trace
+          RUST_LOG: trace
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: cmd
-
+
       - name: Build source distribution
         run: |
           python -m build --sdist
-
+
       - name: Publish distribution to PyPI
         # TODO: move to tag based releases
         # if: startsWith(github.ref, 'refs/tags')

.github/workflows/test.yaml

Lines changed: 4 additions & 4 deletions

@@ -46,7 +46,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
@@ -86,7 +86,7 @@ jobs:
           cache: 'pip'

       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
@@ -129,7 +129,7 @@ jobs:
           cache: 'pip'

       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
@@ -168,7 +168,7 @@ jobs:
           python-version: "3.9"

       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}

CHANGELOG.md

Lines changed: 13 additions & 0 deletions

@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.3.0]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c32be71b91b42ecc538bd902e93cbb5fb36cb
+- feat: Enable detokenizing special tokens with special=True by @benniekiss in #1596
+- feat(ci): Speed up CI workflows using uv, add support for CUDA 12.5 wheels by @Smartappli in e529940f45d42ed8aa31334123b8d66bc67b0e78
+- feat: Add loading sharded GGUF files from HuggingFace with Llama.from_pretrained(additional_files=[...]) by @Gnurro in 84c092063e8f222758dd3d60bdb2d1d342ac292e
+- feat: Add option to configure n_ubatch by @abetlen in 6c44a3f36b089239cb6396bb408116aad262c702
+- feat: Update sampling API for llama.cpp. Sampling now uses sampler chain by @abetlen in f8fcb3ea3424bcfba3a5437626a994771a02324b
+- fix: Don't store scores internally unless logits_all=True. Reduces memory requirements for large context by @abetlen in 29afcfdff5e75d7df4c13bad0122c98661d251ab
+- fix: Fix memory allocation of ndarray in by @xu-song in #1704
+- fix: Use system message in og qwen format by @abetlen in 98eb092d3c6e7c142c4ba2faaca6c091718abbb3
+
 ## [0.2.90]

 - feat: Update llama.cpp to ggerganov/llama.cpp@1d1ccce67613674c75c9c7e3fa4c1e24e428ba48
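The `special=True` detokenization entry (#1596) is only referenced in the changelog and not shown in this diff, so the exact signature is assumed here; a usage sketch with a placeholder model path:

```python
from llama_cpp import Llama

# Placeholder model path; special=True keeps special tokens (e.g. chat-template
# markers) when tokenizing and detokenizing, per the #1596 entry above.
llm = Llama(model_path="./model.gguf", n_ctx=2048)
tokens = llm.tokenize(b"<|user|>Hello", special=True)
text = llm.detokenize(tokens, special=True)
print(text.decode("utf-8"))
```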

README.md

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ Where `<cuda-version>` is one of the following:
 - `cu122`: CUDA 12.2
 - `cu123`: CUDA 12.3
 - `cu124`: CUDA 12.4
-- `cu124`: CUDA 12.5
+- `cu125`: CUDA 12.5

 For example, to install the CUDA 12.1 wheel:

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.90"
+__version__ = "0.3.0"

llama_cpp/_logger.py

Lines changed: 7 additions & 0 deletions

@@ -10,17 +10,20 @@
 # GGML_LOG_LEVEL_WARN = 2,
 # GGML_LOG_LEVEL_ERROR = 3,
 # GGML_LOG_LEVEL_DEBUG = 4,
+# GGML_LOG_LEVEL_CONT = 5, // continue previous log
 # };
 GGML_LOG_LEVEL_TO_LOGGING_LEVEL = {
     0: logging.CRITICAL,
     1: logging.INFO,
     2: logging.WARNING,
     3: logging.ERROR,
     4: logging.DEBUG,
+    5: logging.DEBUG,
 }

 logger = logging.getLogger("llama-cpp-python")

+_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0]

 # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 @llama_cpp.llama_log_callback
@@ -29,8 +32,12 @@ def llama_log_callback(
     text: bytes,
     user_data: ctypes.c_void_p,
 ):
+    # TODO: Correctly implement continue previous log
+    global _last_log_level
+    log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level
     if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
         print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
+    _last_log_level = log_level


 llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
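The callback above routes llama.cpp log output through the `llama-cpp-python` logger's level, with GGML level 5 (CONT) inheriting the previous message's level. To control how much of that output is printed, adjust the named logger (a minimal sketch):

```python
import logging

# Raise the threshold so only errors from llama.cpp are printed, or lower it to
# logging.DEBUG to see everything the callback above forwards to stderr.
logging.getLogger("llama-cpp-python").setLevel(logging.ERROR)
```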

llama_cpp/llama.py

Lines changed: 50 additions & 8 deletions

@@ -75,6 +75,7 @@ def __init__(
         seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
         n_batch: int = 512,
+        n_ubatch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -156,6 +157,7 @@ def __init__(
             seed: RNG seed, -1 for random
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
+            n_ubatch: Physical batch size
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -309,6 +311,7 @@ def __init__(
         self.context_params = llama_cpp.llama_context_default_params()
         self.context_params.n_ctx = n_ctx
         self.context_params.n_batch = self.n_batch
+        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
         self.context_params.rope_scaling_type = (
@@ -380,6 +383,7 @@ def __init__(
             self.n_batch = min(n_ctx, n_batch)
             self.context_params.n_ctx = self._model.n_ctx_train()
             self.context_params.n_batch = self.n_batch
+            self.context_params.n_ubatch = min(self.n_batch, n_ubatch)

         self._ctx = self._stack.enter_context(
             contextlib.closing(
@@ -451,7 +455,7 @@ def free_lora_adapter():
         self.n_tokens = 0
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
         self.scores: npt.NDArray[np.single] = np.ndarray(
-            (n_ctx, self._n_vocab), dtype=np.single
+            (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
         )

         self._mirostat_mu = ctypes.c_float(
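A usage sketch for the new `n_ubatch` parameter added above (the model path is a placeholder); as in the diff, the effective value is capped at `n_batch`:

```python
from llama_cpp import Llama

# n_batch is the maximum prompt-processing batch size; n_ubatch is the physical
# (micro) batch size passed through to llama.cpp. Values larger than n_batch are
# clamped, per the min(self.n_batch, n_ubatch) lines above.
llm = Llama(
    model_path="./model.gguf",  # placeholder path
    n_ctx=4096,
    n_batch=512,
    n_ubatch=256,
)
```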
@@ -648,12 +652,14 @@ def eval(self, tokens: Sequence[int]):
                 )
                 self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
             else:
-                rows = 1
-                cols = self._n_vocab
-                logits = np.ctypeslib.as_array(
-                    self._ctx.get_logits(), shape=(rows * cols,)
-                )
-                self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # rows = 1
+                # cols = self._n_vocab
+                # logits = np.ctypeslib.as_array(
+                #     self._ctx.get_logits(), shape=(rows * cols,)
+                # )
+                # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
+                pass
             # Update n_tokens
             self.n_tokens += n_tokens
@@ -801,8 +807,10 @@ def sample(
             grammar=grammar,
         )

+        ridx = idx - self.n_tokens if idx is not None else -1
+
         assert self.ctx is not None
-        token = self._sampler.sample(self._ctx, -1)
+        token = self._sampler.sample(self._ctx, ridx)
         if tmp_sampler:
             self._sampler = None
         return token
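Per the NOTE in the hunk above, logits are now only stored when they are needed for logprobs, which requires constructing the model with `logits_all=True`. A sketch (placeholder model path; arguments limited to the existing public API):

```python
from llama_cpp import Llama

# Without logits_all=True the scores buffer only holds n_batch rows (see the
# allocation change above), so per-token logprobs are not available.
llm = Llama(model_path="./model.gguf", logits_all=True)  # placeholder path
out = llm.create_completion("The capital of France is", max_tokens=8, logprobs=3)
print(out["choices"][0]["logprobs"])
```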
@@ -2069,6 +2077,7 @@ def __getstate__(self):
             seed=self.context_params.seed,
             n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
+            n_ubatch=self.context_params.n_ubatch,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
@@ -2225,6 +2234,7 @@ def from_pretrained(
         cls,
         repo_id: str,
         filename: Optional[str],
+        additional_files: Optional[List] = None,
         local_dir: Optional[Union[str, os.PathLike[str]]] = None,
         local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
         cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
@@ -2237,6 +2247,7 @@ def from_pretrained(
         Args:
             repo_id: The model repo id.
             filename: A filename or glob pattern to match the model file in the repo.
+            additional_files: A list of filenames or glob patterns to match additional model files in the repo.
             local_dir: The local directory to save the model to.
             local_dir_use_symlinks: Whether to use symlinks when downloading the model.
             **kwargs: Additional keyword arguments to pass to the Llama constructor.
@@ -2267,6 +2278,7 @@
             rel_path = Path(file).relative_to(repo_id)
             file_list.append(str(rel_path))

+        # find the only/first shard file:
         matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore

         if len(matching_files) == 0:
@@ -2296,6 +2308,35 @@
             cache_dir=cache_dir,
         )

+        if additional_files:
+            for additonal_file_name in additional_files:
+                # find the additional shard file:
+                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+
+                if len(matching_additional_files) == 0:
+                    raise ValueError(
+                        f"No file found in {repo_id} that match {additonal_file_name}\n\n"
+                        f"Available Files:\n{json.dumps(file_list)}"
+                    )
+
+                if len(matching_additional_files) > 1:
+                    raise ValueError(
+                        f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n"
+                        f"Available Files:\n{json.dumps(files)}"
+                    )
+
+                (matching_additional_file,) = matching_additional_files
+
+                # download the additional file
+                hf_hub_download(
+                    repo_id=repo_id,
+                    filename=matching_additional_file,
+                    subfolder=subfolder,
+                    local_dir=local_dir,
+                    local_dir_use_symlinks=local_dir_use_symlinks,
+                    cache_dir=cache_dir,
+                )
+
         if local_dir is None:
             model_path = hf_hub_download(
                 repo_id=repo_id,
@@ -2309,6 +2350,7 @@
         else:
             model_path = os.path.join(local_dir, filename)

+        # loading the first file of a sharded GGUF loads all remaining shard files in the subfolder
         return cls(
             model_path=model_path,
             **kwargs,
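A usage sketch for the `additional_files` parameter added above; the repo id and shard file patterns are placeholders following the usual sharded-GGUF naming convention:

```python
from llama_cpp import Llama

# The first shard is matched by filename; additional_files patterns match the
# remaining shards, which are downloaded alongside it. Per the comment in the
# diff, loading the first shard then pulls in the rest automatically.
llm = Llama.from_pretrained(
    repo_id="someuser/some-model-GGUF",                 # placeholder repo
    filename="*Q4_K_M-00001-of-00002.gguf",             # first shard (glob pattern)
    additional_files=["*Q4_K_M-00002-of-00002.gguf"],   # remaining shard(s)
)
```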

llama_cpp/server/model.py

Lines changed: 1 addition & 0 deletions

@@ -249,6 +249,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         seed=settings.seed,
         n_ctx=settings.n_ctx,
         n_batch=settings.n_batch,
+        n_ubatch=settings.n_ubatch,
         n_threads=settings.n_threads,
         n_threads_batch=settings.n_threads_batch,
         rope_scaling_type=settings.rope_scaling_type,

llama_cpp/server/settings.py

Lines changed: 3 additions & 0 deletions

@@ -70,6 +70,9 @@ class ModelSettings(BaseSettings):
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
+    n_ubatch: int = Field(
+        default=512, ge=1, description="The physical batch size used by llama.cpp"
+    )
     n_threads: int = Field(
         default=max(multiprocessing.cpu_count() // 2, 1),
         ge=1,
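The new `n_ubatch` field flows from `ModelSettings` into `load_llama_from_model_settings` shown above; a sketch (the model path is a placeholder, and the import paths follow the file paths in this diff):

```python
from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

# n_ubatch set alongside n_batch; both default to 512 per the Field definitions above.
settings = ModelSettings(model="./model.gguf", n_batch=512, n_ubatch=256)
llama = load_llama_from_model_settings(settings)
```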

vendor/llama.cpp (submodule updated; see the llama.cpp bump noted in CHANGELOG.md above)
