Commit c94bdc4 (1 parent: 926b414)

Expose libggml and refactor ctypes extension

2 files changed (+274, −0 lines)

llama_cpp/_ctypes_extensions.py

Lines changed: 131 additions & 0 deletions
```python
from __future__ import annotations

import sys
import os
import ctypes
import functools
import pathlib

from typing import (
    Any,
    Callable,
    List,
    Union,
    Optional,
    TYPE_CHECKING,
    TypeVar,
    Generic,
)
from typing_extensions import TypeAlias


# Load the library
def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
    """Platform independent shared library loader"""
    # Search for the library in the given directory under the name "libllama"
    # (default name for llama.cpp) and "llama" (default name for this repo)
    lib_paths: List[pathlib.Path] = []
    # Determine the file extension based on the platform
    if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"):
        lib_paths += [
            base_path / f"lib{lib_base_name}.so",
        ]
    elif sys.platform == "darwin":
        lib_paths += [
            base_path / f"lib{lib_base_name}.so",
            base_path / f"lib{lib_base_name}.dylib",
        ]
    elif sys.platform == "win32":
        lib_paths += [
            base_path / f"{lib_base_name}.dll",
            base_path / f"lib{lib_base_name}.dll",
        ]
    else:
        raise RuntimeError("Unsupported platform")

    cdll_args = dict()  # type: ignore

    # Add the library directory to the DLL search path on Windows (if needed)
    if sys.platform == "win32":
        os.add_dll_directory(str(base_path))
        os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"]

    if sys.platform == "win32" and sys.version_info >= (3, 8):
        os.add_dll_directory(str(base_path))
        if "CUDA_PATH" in os.environ:
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
        if "HIP_PATH" in os.environ:
            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
        cdll_args["winmode"] = ctypes.RTLD_GLOBAL

    # Try to load the shared library, handling potential errors
    for lib_path in lib_paths:
        if lib_path.exists():
            try:
                return ctypes.CDLL(str(lib_path), **cdll_args)  # type: ignore
            except Exception as e:
                raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}")

    raise FileNotFoundError(
        f"Shared library with base name '{lib_base_name}' not found"
    )


# ctypes sane type hint helpers
#
# - Generic Pointer and Array types
# - PointerOrRef type with a type hinted byref function
#
# NOTE: Only use these for static type checking, not for runtime checks;
# no good will come of that.

if TYPE_CHECKING:
    CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData)  # type: ignore

    CtypesArray: TypeAlias = ctypes.Array[CtypesCData]  # type: ignore

    CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData]  # type: ignore

    CtypesVoidPointer: TypeAlias = ctypes.c_void_p

    class CtypesRef(Generic[CtypesCData]):
        pass

    CtypesPointerOrRef: TypeAlias = Union[
        CtypesPointer[CtypesCData], CtypesRef[CtypesCData]
    ]

    CtypesFuncPointer: TypeAlias = ctypes._FuncPointer  # type: ignore

F = TypeVar("F", bound=Callable[..., Any])


def ctypes_function_for_shared_library(lib: ctypes.CDLL):
    """Decorator factory for defining ctypes functions with type hints"""

    def ctypes_function(
        name: str, argtypes: List[Any], restype: Any, enabled: bool = True
    ):
        def decorator(f: F) -> F:
            if enabled:
                func = getattr(lib, name)
                func.argtypes = argtypes
                func.restype = restype
                functools.wraps(f)(func)
                return func
            else:
                return f

        return decorator

    return ctypes_function


def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
    """Type-annotated version of ctypes.byref"""
    ...


byref = _byref if TYPE_CHECKING else ctypes.byref
```
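Together these helpers form the pattern the package's binding modules build on: load the shared library once, then declare each C function with its argtypes/restype and a type-hinted Python stub. Below is a minimal usage sketch (not part of this commit), assuming a `libllama` built into the package's `lib` directory; `llama_max_devices` is a real llama.cpp export used purely as an example:

```python
# Minimal sketch of the intended usage pattern (not part of this commit).
import ctypes
import pathlib

from llama_cpp._ctypes_extensions import (
    load_shared_library,
    ctypes_function_for_shared_library,
)

# Assumes the shared library was built into <package>/lib
_base_path = pathlib.Path(__file__).parent / "lib"
_lib = load_shared_library("llama", _base_path)
_llama_function = ctypes_function_for_shared_library(_lib)


# LLAMA_API size_t llama_max_devices(void);
@_llama_function("llama_max_devices", [], ctypes.c_size_t)
def llama_max_devices() -> int:
    ...


print(llama_max_devices())  # the decorator replaced the stub with the C function
```

The decorated stub exists only for static type checkers; at runtime the decorator returns the configured `ctypes` function object in its place.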

llama_cpp/_ggml.py

Lines changed: 143 additions & 0 deletions
```python
"""Internal module use at your own risk

This module provides a minimal interface for working with ggml tensors from llama-cpp-python
"""
import os
import pathlib
import ctypes

import llama_cpp._ctypes_extensions as ctypes_ext

import numpy as np


libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path)

ggml_function = ctypes_ext.ctypes_function_for_shared_library(libggml)


# define GGML_MAX_DIMS 4
GGML_MAX_DIMS = 4

# define GGML_MAX_OP_PARAMS 64
GGML_MAX_OP_PARAMS = 64

# define GGML_MAX_SRC 10
GGML_MAX_SRC = 10

# define GGML_MAX_NAME 64
GGML_MAX_NAME = 64


# // n-dimensional tensor
# struct ggml_tensor {
#     enum ggml_type type;
#
#     GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
#
#     struct ggml_backend_buffer * buffer;
#
#     int64_t ne[GGML_MAX_DIMS]; // number of elements
#     size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
#                                // nb[0] = ggml_type_size(type)
#                                // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
#                                // nb[i] = nb[i-1] * ne[i-1]
#
#     // compute data
#     enum ggml_op op;
#
#     // op params - allocated as int32_t for alignment
#     int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
#
#     int32_t flags;
#
#     struct ggml_tensor * grad;
#     struct ggml_tensor * src[GGML_MAX_SRC];
#
#     // source tensor and offset for views
#     struct ggml_tensor * view_src;
#     size_t               view_offs;
#
#     void * data;
#
#     char name[GGML_MAX_NAME];
#
#     void * extra; // extra things e.g. for ggml-cuda.cu
#
#     // char padding[4];
# };
class ggml_tensor(ctypes.Structure):
    # NOTE: ctypes requires the attribute to be named _fields_ (not __fields__),
    # and the array sizes must match the C struct above for the layout to line up.
    _fields_ = [
        ("type", ctypes.c_int),
        ("backend", ctypes.c_int),  # deprecated, but still present in the C struct
        ("buffer", ctypes.c_void_p),
        ("ne", ctypes.c_int64 * GGML_MAX_DIMS),
        ("nb", ctypes.c_size_t * GGML_MAX_DIMS),
        ("op", ctypes.c_int),
        ("op_params", ctypes.c_int32 * (GGML_MAX_OP_PARAMS // 4)),  # 64 bytes of int32_t
        ("flags", ctypes.c_int32),
        ("grad", ctypes.c_void_p),
        ("src", ctypes.c_void_p * GGML_MAX_SRC),
        ("view_src", ctypes.c_void_p),
        ("view_offs", ctypes.c_size_t),
        ("data", ctypes.c_void_p),
        ("name", ctypes.c_char * GGML_MAX_NAME),
        ("extra", ctypes.c_void_p),
    ]


ggml_tensor_p = ctypes_ext.CtypesPointer[ggml_tensor]
ggml_tensor_p_ctypes = ctypes.POINTER(ggml_tensor)


# GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@ggml_function(
    "ggml_backend_tensor_get",
    [ggml_tensor_p_ctypes, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t],
    None,  # the C function returns void
)
def ggml_backend_tensor_get(
    tensor: ggml_tensor_p, data: ctypes.c_void_p, offset: int, size: int
) -> None:
    ...


# GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
@ggml_function(
    "ggml_nbytes",
    [ggml_tensor_p_ctypes],
    ctypes.c_size_t,
)
def ggml_nbytes(tensor: ggml_tensor_p) -> int:
    ...


# GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
@ggml_function(
    "ggml_nelements",
    [ggml_tensor_p_ctypes],
    ctypes.c_int64,
)
def ggml_nelements(tensor: ggml_tensor_p) -> int:
    ...


# GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
@ggml_function(
    "ggml_n_dims",
    [ggml_tensor_p_ctypes],
    ctypes.c_int,
)
def ggml_n_dims(tensor: ggml_tensor_p) -> int:
    ...


def ggml_tensor_to_numpy(tensor: ggml_tensor_p):
    """Copy an f32 ggml tensor into a new numpy array."""
    nbytes = ggml_nbytes(tensor)
    nelements = ggml_nelements(tensor)
    # Assumes an f32 tensor; quantized types would need dequantization first
    assert nbytes == nelements * 4, "expected an f32 tensor"
    data = np.empty(nelements, dtype=np.float32)
    # Copy the tensor contents out of backend memory (CPU or GPU buffers)
    ggml_backend_tensor_get(
        tensor, ctypes.cast(data.ctypes.data, ctypes.c_void_p), 0, nbytes
    )
    # ggml stores ne[] innermost-first, so reverse it for a row-major numpy shape
    shape = tuple(reversed(tensor.contents.ne[: ggml_n_dims(tensor)]))
    return data.reshape(shape)
```