
Commit 340e58a

Merge branch 'main' into patch-1
2 parents: cfe98dd + 9992c50

File tree: 12 files changed, +94 −24 lines

.github/workflows/build-and-release.yaml

Lines changed: 5 additions & 3 deletions

@@ -92,7 +92,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
-
+
       - uses: actions/setup-python@v5
         with:
           python-version: "3.9"
@@ -103,6 +103,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install uv
           RUST_LOG=trace python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: bash

       - name: Install dependencies (Windows)
@@ -113,12 +114,13 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: cmd
-
+
       - name: Build source distribution
         run: |
           python -m build --sdist
-
+
       - uses: actions/upload-artifact@v4
         with:
           name: sdist
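The addition above installs the PyPA `build` package with uv so the later `python -m build --sdist` step has it available. A minimal local equivalent of that step, assuming `build` is installed in the current environment:

```python
# Sketch of the sdist step the workflow runs, driven from Python instead of the
# shell step above; writes the source distribution into dist/.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "build", "--sdist"], check=True)
```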

.github/workflows/build-wheels-cuda.yaml

Lines changed: 1 addition & 1 deletion

@@ -72,7 +72,7 @@ jobs:
       - name: VS Integration Cache
         id: vs-integration-cache
         if: runner.os == 'Windows'
-        uses: actions/cache@v4.0.2
+        uses: actions/cache@v4
         with:
           path: ./MSBuildExtensions
           key: cuda-${{ matrix.cuda }}-vs-integration

.github/workflows/publish.yaml

Lines changed: 7 additions & 5 deletions

@@ -13,34 +13,36 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
-
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
           python-version: "3.9"
-
+
       - name: Install dependencies (Linux/MacOS)
         if: runner.os != 'Windows'
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           RUST_LOG=trace python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: bash

       - name: Install dependencies (Windows)
         if: runner.os == 'Windows'
         env:
-          RUST_LOG: trace
+          RUST_LOG: trace
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: cmd
-
+
       - name: Build source distribution
         run: |
           python -m build --sdist
-
+
       - name: Publish distribution to PyPI
         # TODO: move to tag based releases
         # if: startsWith(github.ref, 'refs/tags')

.github/workflows/test.yaml

Lines changed: 4 additions & 4 deletions

@@ -46,7 +46,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
@@ -86,7 +86,7 @@ jobs:
           cache: 'pip'

       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
@@ -129,7 +129,7 @@ jobs:
           cache: 'pip'

       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
@@ -168,7 +168,7 @@ jobs:
           python-version: "3.9"

       - name: Restore model cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}

CHANGELOG.md

Lines changed: 13 additions & 0 deletions

@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.3.0]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c32be71b91b42ecc538bd902e93cbb5fb36cb
+- feat: Enable detokenizing special tokens with special=True by @benniekiss in #1596
+- feat(ci): Speed up CI workflows using uv, add support for CUDA 12.5 wheels by @Smartappli in e529940f45d42ed8aa31334123b8d66bc67b0e78
+- feat: Add loading sharded GGUF files from HuggingFace with Llama.from_pretrained(additional_files=[...]) by @Gnurro in 84c092063e8f222758dd3d60bdb2d1d342ac292e
+- feat: Add option to configure n_ubatch by @abetlen in 6c44a3f36b089239cb6396bb408116aad262c702
+- feat: Update sampling API for llama.cpp. Sampling now uses sampler chain by @abetlen in f8fcb3ea3424bcfba3a5437626a994771a02324b
+- fix: Don't store scores internally unless logits_all=True. Reduces memory requirements for large context by @abetlen in 29afcfdff5e75d7df4c13bad0122c98661d251ab
+- fix: Fix memory allocation of ndarray in by @xu-song in #1704
+- fix: Use system message in og qwen format by @abetlen in 98eb092d3c6e7c142c4ba2faaca6c091718abbb3
+
 ## [0.2.90]

 - feat: Update llama.cpp to ggerganov/llama.cpp@1d1ccce67613674c75c9c7e3fa4c1e24e428ba48
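The `special=True` detokenization entry (#1596) is only referenced in the changelog and not shown in this diff, so the exact signature is assumed here; a usage sketch with a placeholder model path:

```python
from llama_cpp import Llama

# Placeholder model path; special=True keeps special tokens (e.g. chat-template
# markers) when tokenizing and detokenizing, per the #1596 entry above.
llm = Llama(model_path="./model.gguf", n_ctx=2048)
tokens = llm.tokenize(b"<|user|>Hello", special=True)
text = llm.detokenize(tokens, special=True)
print(text.decode("utf-8"))
```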

README.md

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ Where `<cuda-version>` is one of the following:
 - `cu122`: CUDA 12.2
 - `cu123`: CUDA 12.3
 - `cu124`: CUDA 12.4
-- `cu124`: CUDA 12.5
+- `cu125`: CUDA 12.5

 For example, to install the CUDA 12.1 wheel:

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.90"
+__version__ = "0.3.0"

llama_cpp/_logger.py

Lines changed: 7 additions & 0 deletions

@@ -10,17 +10,20 @@
 # GGML_LOG_LEVEL_WARN = 2,
 # GGML_LOG_LEVEL_ERROR = 3,
 # GGML_LOG_LEVEL_DEBUG = 4,
+# GGML_LOG_LEVEL_CONT = 5, // continue previous log
 # };
 GGML_LOG_LEVEL_TO_LOGGING_LEVEL = {
     0: logging.CRITICAL,
     1: logging.INFO,
     2: logging.WARNING,
     3: logging.ERROR,
     4: logging.DEBUG,
+    5: logging.DEBUG,
 }

 logger = logging.getLogger("llama-cpp-python")

+_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0]

 # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 @llama_cpp.llama_log_callback
@@ -29,8 +32,12 @@ def llama_log_callback(
     text: bytes,
     user_data: ctypes.c_void_p,
 ):
+    # TODO: Correctly implement continue previous log
+    global _last_log_level
+    log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level
     if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
         print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
+    _last_log_level = log_level


 llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
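The callback above routes llama.cpp log output through the `llama-cpp-python` logger's level, with GGML level 5 (CONT) inheriting the previous message's level. To control how much of that output is printed, adjust the named logger (a minimal sketch):

```python
import logging

# Raise the threshold so only errors from llama.cpp are printed, or lower it to
# logging.DEBUG to see everything the callback above forwards to stderr.
logging.getLogger("llama-cpp-python").setLevel(logging.ERROR)
```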

llama_cpp/llama.py

Lines changed: 50 additions & 8 deletions

@@ -75,6 +75,7 @@ def __init__(
         seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
         n_batch: int = 512,
+        n_ubatch: int = 512,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -156,6 +157,7 @@ def __init__(
             seed: RNG seed, -1 for random
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
+            n_ubatch: Physical batch size
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -309,6 +311,7 @@ def __init__(
         self.context_params = llama_cpp.llama_context_default_params()
         self.context_params.n_ctx = n_ctx
         self.context_params.n_batch = self.n_batch
+        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
         self.context_params.rope_scaling_type = (
@@ -380,6 +383,7 @@ def __init__(
             self.n_batch = min(n_ctx, n_batch)
             self.context_params.n_ctx = self._model.n_ctx_train()
             self.context_params.n_batch = self.n_batch
+            self.context_params.n_ubatch = min(self.n_batch, n_ubatch)

         self._ctx = self._stack.enter_context(
             contextlib.closing(
@@ -451,7 +455,7 @@ def free_lora_adapter():
         self.n_tokens = 0
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
         self.scores: npt.NDArray[np.single] = np.ndarray(
-            (n_ctx, self._n_vocab), dtype=np.single
+            (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
         )

         self._mirostat_mu = ctypes.c_float(
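A usage sketch for the new `n_ubatch` parameter added above (the model path is a placeholder); as in the diff, the effective value is capped at `n_batch`:

```python
from llama_cpp import Llama

# n_batch is the maximum prompt-processing batch size; n_ubatch is the physical
# (micro) batch size passed through to llama.cpp. Values larger than n_batch are
# clamped, per the min(self.n_batch, n_ubatch) lines above.
llm = Llama(
    model_path="./model.gguf",  # placeholder path
    n_ctx=4096,
    n_batch=512,
    n_ubatch=256,
)
```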
@@ -648,12 +652,14 @@ def eval(self, tokens: Sequence[int]):
                 )
                 self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
             else:
-                rows = 1
-                cols = self._n_vocab
-                logits = np.ctypeslib.as_array(
-                    self._ctx.get_logits(), shape=(rows * cols,)
-                )
-                self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # rows = 1
+                # cols = self._n_vocab
+                # logits = np.ctypeslib.as_array(
+                #     self._ctx.get_logits(), shape=(rows * cols,)
+                # )
+                # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
+                pass
             # Update n_tokens
             self.n_tokens += n_tokens
@@ -801,8 +807,10 @@ def sample(
             grammar=grammar,
         )

+        ridx = idx - self.n_tokens if idx is not None else -1
+
         assert self.ctx is not None
-        token = self._sampler.sample(self._ctx, -1)
+        token = self._sampler.sample(self._ctx, ridx)
         if tmp_sampler:
             self._sampler = None
         return token
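Per the NOTE in the hunk above, logits are now only stored when they are needed for logprobs, which requires constructing the model with `logits_all=True`. A sketch (placeholder model path; arguments limited to the existing public API):

```python
from llama_cpp import Llama

# Without logits_all=True the scores buffer only holds n_batch rows (see the
# allocation change above), so per-token logprobs are not available.
llm = Llama(model_path="./model.gguf", logits_all=True)  # placeholder path
out = llm.create_completion("The capital of France is", max_tokens=8, logprobs=3)
print(out["choices"][0]["logprobs"])
```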
@@ -2069,6 +2077,7 @@ def __getstate__(self):
             seed=self.context_params.seed,
             n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
+            n_ubatch=self.context_params.n_ubatch,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
@@ -2225,6 +2234,7 @@ def from_pretrained(
         cls,
         repo_id: str,
         filename: Optional[str],
+        additional_files: Optional[List] = None,
         local_dir: Optional[Union[str, os.PathLike[str]]] = None,
         local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
         cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
@@ -2237,6 +2247,7 @@ def from_pretrained(
         Args:
             repo_id: The model repo id.
             filename: A filename or glob pattern to match the model file in the repo.
+            additional_files: A list of filenames or glob patterns to match additional model files in the repo.
             local_dir: The local directory to save the model to.
             local_dir_use_symlinks: Whether to use symlinks when downloading the model.
             **kwargs: Additional keyword arguments to pass to the Llama constructor.
@@ -2267,6 +2278,7 @@
             rel_path = Path(file).relative_to(repo_id)
             file_list.append(str(rel_path))

+        # find the only/first shard file:
         matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore

         if len(matching_files) == 0:
@@ -2296,6 +2308,35 @@
             cache_dir=cache_dir,
         )

+        if additional_files:
+            for additonal_file_name in additional_files:
+                # find the additional shard file:
+                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+
+                if len(matching_additional_files) == 0:
+                    raise ValueError(
+                        f"No file found in {repo_id} that match {additonal_file_name}\n\n"
+                        f"Available Files:\n{json.dumps(file_list)}"
+                    )
+
+                if len(matching_additional_files) > 1:
+                    raise ValueError(
+                        f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n"
+                        f"Available Files:\n{json.dumps(files)}"
+                    )
+
+                (matching_additional_file,) = matching_additional_files
+
+                # download the additional file
+                hf_hub_download(
+                    repo_id=repo_id,
+                    filename=matching_additional_file,
+                    subfolder=subfolder,
+                    local_dir=local_dir,
+                    local_dir_use_symlinks=local_dir_use_symlinks,
+                    cache_dir=cache_dir,
+                )
+
         if local_dir is None:
             model_path = hf_hub_download(
                 repo_id=repo_id,
@@ -2309,6 +2350,7 @@
         else:
             model_path = os.path.join(local_dir, filename)

+        # loading the first file of a sharded GGUF loads all remaining shard files in the subfolder
         return cls(
             model_path=model_path,
             **kwargs,
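A usage sketch for the `additional_files` parameter added above; the repo id and shard file patterns are placeholders following the usual sharded-GGUF naming convention:

```python
from llama_cpp import Llama

# The first shard is matched by filename; additional_files patterns match the
# remaining shards, which are downloaded alongside it. Per the comment in the
# diff, loading the first shard then pulls in the rest automatically.
llm = Llama.from_pretrained(
    repo_id="someuser/some-model-GGUF",                 # placeholder repo
    filename="*Q4_K_M-00001-of-00002.gguf",             # first shard (glob pattern)
    additional_files=["*Q4_K_M-00002-of-00002.gguf"],   # remaining shard(s)
)
```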

llama_cpp/server/model.py

Lines changed: 1 addition & 0 deletions

@@ -249,6 +249,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         seed=settings.seed,
         n_ctx=settings.n_ctx,
         n_batch=settings.n_batch,
+        n_ubatch=settings.n_ubatch,
         n_threads=settings.n_threads,
         n_threads_batch=settings.n_threads_batch,
         rope_scaling_type=settings.rope_scaling_type,

llama_cpp/server/settings.py

Lines changed: 3 additions & 0 deletions

@@ -70,6 +70,9 @@ class ModelSettings(BaseSettings):
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
+    n_ubatch: int = Field(
+        default=512, ge=1, description="The physical batch size used by llama.cpp"
+    )
     n_threads: int = Field(
         default=max(multiprocessing.cpu_count() // 2, 1),
         ge=1,
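The new `n_ubatch` field flows from `ModelSettings` into `load_llama_from_model_settings` shown above; a sketch (the model path is a placeholder, and the import paths follow the file paths in this diff):

```python
from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

# n_ubatch set alongside n_batch; both default to 512 per the Field definitions above.
settings = ModelSettings(model="./model.gguf", n_batch=512, n_ubatch=256)
llama = load_llama_from_model_settings(settings)
```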

vendor/llama.cpp (submodule updated; see the llama.cpp bump noted in CHANGELOG.md above)
