
Commit 7626690

Merge b3173

2 parents 006167a + a94e6ff

8 files changed: +305 / -48 lines

README.md

Lines changed: 24 additions & 0 deletions
@@ -387,6 +387,30 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
 
+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.
+
 ### Metal Build
 
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.

convert-hf-to-gguf.py

Lines changed: 6 additions & 0 deletions
@@ -1632,6 +1632,12 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 
     _experts: list[dict[str, Tensor]] | None = None

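The walrus-operator pattern above only emits a GGUF key when the corresponding field exists in the model's hyperparameters, so dense checkpoints are unaffected. A minimal sketch of the same pattern, using a plain dict in place of `self.hparams` and a stub writer instead of `GGUFWriter` (the hparam values and the "qwen2moe" arch string are assumptions for illustration):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("convert")


class StubWriter:
    """Stands in for GGUFWriter in this sketch: it just records key/value pairs."""
    def __init__(self) -> None:
        self.kv: dict[str, int] = {}

    def add_uint32(self, key: str, value: int) -> None:
        self.kv[key] = value


# hypothetical hparams, shaped like the config.json of a MoE checkpoint
hparams = {
    "num_experts": 60,
    "moe_intermediate_size": 1408,
    "shared_expert_intermediate_size": 5632,
}

writer = StubWriter()
arch = "qwen2moe"  # assumed architecture name, for illustration only

# write a key only when the optional hparam is actually present
if (moe_ffn := hparams.get("moe_intermediate_size")) is not None:
    writer.add_uint32(f"{arch}.expert_feed_forward_length", moe_ffn)
    logger.info(f"gguf: expert feed forward length = {moe_ffn}")
if (shared_ffn := hparams.get("shared_expert_intermediate_size")) is not None:
    writer.add_uint32(f"{arch}.expert_shared_feed_forward_length", shared_ffn)
    logger.info(f"gguf: expert shared feed forward length = {shared_ffn}")

print(writer.kv)
```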
ggml-backend.c

Lines changed: 1 addition & 1 deletion
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // check if a backend with higher prio wants to offload the op
     if (src_backend_id == sched->n_backends - 1) {
         for (int b = 0; b < src_backend_id; b++) {
-            if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+            if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                 SET_CAUSE(tensor, "1.off");
                 return b;
             }

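The fix adds a second predicate to the offload decision: a higher-priority backend is only chosen if it both supports the op and asks to offload it; with the offload check alone, a backend could be picked for an op it does not support. A toy Python model of that decision, not the ggml API (backend names and predicates are invented for illustration):

```python
from dataclasses import dataclass
from typing import Callable


@dataclass
class Backend:
    """Toy stand-in for a ggml backend; the two predicates mirror
    ggml_backend_supports_op and ggml_backend_offload_op."""
    name: str
    supports_op: Callable[[str], bool]
    offload_op: Callable[[str], bool]


def pick_backend(backends: list[Backend], op: str, src_backend_id: int) -> int:
    # mirrors the patched loop: when the op's source lives on the lowest-priority
    # backend (e.g. the CPU), scan higher-priority backends and offload only if
    # the candidate both supports the op AND asks to offload it
    if src_backend_id == len(backends) - 1:
        for b in range(src_backend_id):
            if backends[b].supports_op(op) and backends[b].offload_op(op):
                return b
    return src_backend_id


# hypothetical setup: an eager GPU backend that wants to offload everything
# but does not actually implement "custom-op"
gpu = Backend("gpu", supports_op=lambda op: op != "custom-op", offload_op=lambda op: True)
cpu = Backend("cpu", supports_op=lambda op: True, offload_op=lambda op: False)
backends = [gpu, cpu]

print(pick_backend(backends, "matmul", src_backend_id=1))     # 0: offloaded to the GPU
print(pick_backend(backends, "custom-op", src_backend_id=1))  # 1: stays on the CPU
```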
ggml-impl.h

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#if defined(_WIN32)
+#if defined(_MSC_VER)
 
 #define m512bh(p) p
 #define m512i(p) p

ggml-rpc.cpp

Lines changed: 8 additions & 3 deletions
@@ -73,9 +73,13 @@ struct rpc_tensor {
     uint64_t view_offs;
     uint64_t data;
     char name[GGML_MAX_NAME];
+
+    char padding[4];
 };
 #pragma pack(pop)
 
+static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
+
 // RPC commands
 enum rpc_cmd {
     ALLOC_BUFFER = 0,

@@ -599,9 +603,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
     int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
     output.resize(output_size, 0);
     memcpy(output.data(), &n_nodes, sizeof(n_nodes));
-    uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes));
     for (uint32_t i = 0; i < n_nodes; i++) {
-        out_nodes[i] = reinterpret_cast<uint64_t>(cgraph->nodes[i]);
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
     }
     uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
     *out_ntensors = n_tensors;

@@ -1036,7 +1039,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     }
     std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
     for (uint32_t i = 0; i < n_nodes; i++) {
-        graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
+        int64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
     }
     ggml_status status = ggml_backend_graph_compute(backend, graph);
     // output serialization format: | status (1 byte) |

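All three ggml-rpc.cpp hunks are about alignment of the serialized graph: the 8-byte node ids start right after a 4-byte count, so both sides now move them with memcpy instead of through casted uint64_t pointers, and rpc_tensor gets 4 trailing pad bytes so its packed size stays a multiple of 8 (enforced by the new static_assert). A small sketch with Python's struct module makes the offsets visible; the record layout at the end is a simplified stand-in, not the real rpc_tensor field list:

```python
import struct

# serialize_graph lays the buffer out as:
# | n_nodes (uint32) | node ids (n_nodes * uint64) | n_tensors (uint32) | rpc_tensor[] |
n_nodes = 3
node_ids_offset = struct.calcsize("<I")  # 4: the first uint64 sits at offset 4, not 8-byte aligned
tensors_offset = node_ids_offset + n_nodes * 8 + struct.calcsize("<I")

print(node_ids_offset % 8)  # 4 -> why the ids are now copied with memcpy rather than cast to uint64_t*
print(tensors_offset % 8)   # 0 -> 4 + 8*n + 4 is always a multiple of 8

# simplified stand-in for the packed rpc_tensor (the real struct has more fields):
# without tail padding the packed size is not a multiple of 8, so consecutive array
# elements would not all start on 8-byte boundaries
unpadded = struct.calcsize("<QQ64sI")    # e.g. id, data, name[64], one uint32
padded = struct.calcsize("<QQ64sI4x")    # same fields plus 4 pad bytes, like `char padding[4]`
print(unpadded % 8, padded % 8)          # 4 0
```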
gguf-py/gguf/constants.py

Lines changed: 16 additions & 15 deletions
@@ -33,21 +33,22 @@ class General:
         FILE_TYPE = "general.file_type"
 
     class LLM:
-        VOCAB_SIZE                 = "{arch}.vocab_size"
-        CONTEXT_LENGTH             = "{arch}.context_length"
-        EMBEDDING_LENGTH           = "{arch}.embedding_length"
-        BLOCK_COUNT                = "{arch}.block_count"
-        LEADING_DENSE_BLOCK_COUNT  = "{arch}.leading_dense_block_count"
-        FEED_FORWARD_LENGTH        = "{arch}.feed_forward_length"
-        EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
-        USE_PARALLEL_RESIDUAL      = "{arch}.use_parallel_residual"
-        TENSOR_DATA_LAYOUT         = "{arch}.tensor_data_layout"
-        EXPERT_COUNT               = "{arch}.expert_count"
-        EXPERT_USED_COUNT          = "{arch}.expert_used_count"
-        EXPERT_SHARED_COUNT        = "{arch}.expert_shared_count"
-        EXPERT_WEIGHTS_SCALE       = "{arch}.expert_weights_scale"
-        POOLING_TYPE               = "{arch}.pooling_type"
-        LOGIT_SCALE                = "{arch}.logit_scale"
+        VOCAB_SIZE                        = "{arch}.vocab_size"
+        CONTEXT_LENGTH                    = "{arch}.context_length"
+        EMBEDDING_LENGTH                  = "{arch}.embedding_length"
+        BLOCK_COUNT                       = "{arch}.block_count"
+        LEADING_DENSE_BLOCK_COUNT         = "{arch}.leading_dense_block_count"
+        FEED_FORWARD_LENGTH               = "{arch}.feed_forward_length"
+        EXPERT_FEED_FORWARD_LENGTH        = "{arch}.expert_feed_forward_length"
+        EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
+        USE_PARALLEL_RESIDUAL             = "{arch}.use_parallel_residual"
+        TENSOR_DATA_LAYOUT                = "{arch}.tensor_data_layout"
+        EXPERT_COUNT                      = "{arch}.expert_count"
+        EXPERT_USED_COUNT                 = "{arch}.expert_used_count"
+        EXPERT_SHARED_COUNT               = "{arch}.expert_shared_count"
+        EXPERT_WEIGHTS_SCALE              = "{arch}.expert_weights_scale"
+        POOLING_TYPE                      = "{arch}.pooling_type"
+        LOGIT_SCALE                       = "{arch}.logit_scale"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -394,6 +394,9 @@ def add_feed_forward_length(self, length: int) -> None:
     def add_expert_feed_forward_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
 
+    def add_expert_shared_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

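The new writer method is a thin wrapper: it formats the per-architecture key template from constants.py and stores the value as a uint32 in the GGUF KV section. A minimal sketch of the key it produces, assuming a "qwen2moe" architecture string and an example value:

```python
# key template added to gguf-py/gguf/constants.py in this commit
EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"

# add_expert_shared_feed_forward_length(5632) on a writer whose arch is "qwen2moe"
# would store 5632 as a uint32 under this key (arch name and value are examples)
key = EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch="qwen2moe")
print(key)  # qwen2moe.expert_shared_feed_forward_length
```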