
Commit 7626690

Merge b3173

2 parents 006167a + a94e6ff

8 files changed: +305 / -48 lines

README.md

Lines changed: 24 additions & 0 deletions
@@ -387,6 +387,30 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
 
+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.
+
 ### Metal Build
 
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.

convert-hf-to-gguf.py

Lines changed: 6 additions & 0 deletions
@@ -1632,6 +1632,12 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 
     _experts: list[dict[str, Tensor]] | None = None

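The walrus-operator pattern above only emits a GGUF key when the corresponding field exists in the model's hyperparameters, so dense checkpoints are unaffected. A minimal sketch of the same pattern, using a plain dict in place of `self.hparams` and a stub writer instead of `GGUFWriter` (the hparam values and the "qwen2moe" arch string are assumptions for illustration):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("convert")


class StubWriter:
    """Stands in for GGUFWriter in this sketch: it just records key/value pairs."""
    def __init__(self) -> None:
        self.kv: dict[str, int] = {}

    def add_uint32(self, key: str, value: int) -> None:
        self.kv[key] = value


# hypothetical hparams, shaped like the config.json of a MoE checkpoint
hparams = {
    "num_experts": 60,
    "moe_intermediate_size": 1408,
    "shared_expert_intermediate_size": 5632,
}

writer = StubWriter()
arch = "qwen2moe"  # assumed architecture name, for illustration only

# write a key only when the optional hparam is actually present
if (moe_ffn := hparams.get("moe_intermediate_size")) is not None:
    writer.add_uint32(f"{arch}.expert_feed_forward_length", moe_ffn)
    logger.info(f"gguf: expert feed forward length = {moe_ffn}")
if (shared_ffn := hparams.get("shared_expert_intermediate_size")) is not None:
    writer.add_uint32(f"{arch}.expert_shared_feed_forward_length", shared_ffn)
    logger.info(f"gguf: expert shared feed forward length = {shared_ffn}")

print(writer.kv)
```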
ggml-backend.c

Lines changed: 1 addition & 1 deletion
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // check if a backend with higher prio wants to offload the op
     if (src_backend_id == sched->n_backends - 1) {
         for (int b = 0; b < src_backend_id; b++) {
-            if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+            if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                 SET_CAUSE(tensor, "1.off");
                 return b;
             }

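The fix adds a second predicate to the offload decision: a higher-priority backend is only chosen if it both supports the op and asks to offload it; with the offload check alone, a backend could be picked for an op it does not support. A toy Python model of that decision, not the ggml API (backend names and predicates are invented for illustration):

```python
from dataclasses import dataclass
from typing import Callable


@dataclass
class Backend:
    """Toy stand-in for a ggml backend; the two predicates mirror
    ggml_backend_supports_op and ggml_backend_offload_op."""
    name: str
    supports_op: Callable[[str], bool]
    offload_op: Callable[[str], bool]


def pick_backend(backends: list[Backend], op: str, src_backend_id: int) -> int:
    # mirrors the patched loop: when the op's source lives on the lowest-priority
    # backend (e.g. the CPU), scan higher-priority backends and offload only if
    # the candidate both supports the op AND asks to offload it
    if src_backend_id == len(backends) - 1:
        for b in range(src_backend_id):
            if backends[b].supports_op(op) and backends[b].offload_op(op):
                return b
    return src_backend_id


# hypothetical setup: an eager GPU backend that wants to offload everything
# but does not actually implement "custom-op"
gpu = Backend("gpu", supports_op=lambda op: op != "custom-op", offload_op=lambda op: True)
cpu = Backend("cpu", supports_op=lambda op: True, offload_op=lambda op: False)
backends = [gpu, cpu]

print(pick_backend(backends, "matmul", src_backend_id=1))     # 0: offloaded to the GPU
print(pick_backend(backends, "custom-op", src_backend_id=1))  # 1: stays on the CPU
```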
ggml-impl.h

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#if defined(_WIN32)
+#if defined(_MSC_VER)
 
 #define m512bh(p) p
 #define m512i(p) p

ggml-rpc.cpp

Lines changed: 8 additions & 3 deletions
@@ -73,9 +73,13 @@ struct rpc_tensor {
     uint64_t view_offs;
     uint64_t data;
     char name[GGML_MAX_NAME];
+
+    char padding[4];
 };
 #pragma pack(pop)
 
+static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
+
 // RPC commands
 enum rpc_cmd {
     ALLOC_BUFFER = 0,

@@ -599,9 +603,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
     int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
     output.resize(output_size, 0);
     memcpy(output.data(), &n_nodes, sizeof(n_nodes));
-    uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes));
     for (uint32_t i = 0; i < n_nodes; i++) {
-        out_nodes[i] = reinterpret_cast<uint64_t>(cgraph->nodes[i]);
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
     }
     uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
     *out_ntensors = n_tensors;

@@ -1036,7 +1039,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     }
     std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
     for (uint32_t i = 0; i < n_nodes; i++) {
-        graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
+        int64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
     }
     ggml_status status = ggml_backend_graph_compute(backend, graph);
     // output serialization format: | status (1 byte) |

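All three ggml-rpc.cpp hunks are about alignment of the serialized graph: the 8-byte node ids start right after a 4-byte count, so both sides now move them with memcpy instead of through casted uint64_t pointers, and rpc_tensor gets 4 trailing pad bytes so its packed size stays a multiple of 8 (enforced by the new static_assert). A small sketch with Python's struct module makes the offsets visible; the record layout at the end is a simplified stand-in, not the real rpc_tensor field list:

```python
import struct

# serialize_graph lays the buffer out as:
# | n_nodes (uint32) | node ids (n_nodes * uint64) | n_tensors (uint32) | rpc_tensor[] |
n_nodes = 3
node_ids_offset = struct.calcsize("<I")  # 4: the first uint64 sits at offset 4, not 8-byte aligned
tensors_offset = node_ids_offset + n_nodes * 8 + struct.calcsize("<I")

print(node_ids_offset % 8)  # 4 -> why the ids are now copied with memcpy rather than cast to uint64_t*
print(tensors_offset % 8)   # 0 -> 4 + 8*n + 4 is always a multiple of 8

# simplified stand-in for the packed rpc_tensor (the real struct has more fields):
# without tail padding the packed size is not a multiple of 8, so consecutive array
# elements would not all start on 8-byte boundaries
unpadded = struct.calcsize("<QQ64sI")    # e.g. id, data, name[64], one uint32
padded = struct.calcsize("<QQ64sI4x")    # same fields plus 4 pad bytes, like `char padding[4]`
print(unpadded % 8, padded % 8)          # 4 0
```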
gguf-py/gguf/constants.py

Lines changed: 16 additions & 15 deletions
@@ -33,21 +33,22 @@ class General:
         FILE_TYPE = "general.file_type"
 
     class LLM:
-        VOCAB_SIZE                 = "{arch}.vocab_size"
-        CONTEXT_LENGTH             = "{arch}.context_length"
-        EMBEDDING_LENGTH           = "{arch}.embedding_length"
-        BLOCK_COUNT                = "{arch}.block_count"
-        LEADING_DENSE_BLOCK_COUNT  = "{arch}.leading_dense_block_count"
-        FEED_FORWARD_LENGTH        = "{arch}.feed_forward_length"
-        EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
-        USE_PARALLEL_RESIDUAL      = "{arch}.use_parallel_residual"
-        TENSOR_DATA_LAYOUT         = "{arch}.tensor_data_layout"
-        EXPERT_COUNT               = "{arch}.expert_count"
-        EXPERT_USED_COUNT          = "{arch}.expert_used_count"
-        EXPERT_SHARED_COUNT        = "{arch}.expert_shared_count"
-        EXPERT_WEIGHTS_SCALE       = "{arch}.expert_weights_scale"
-        POOLING_TYPE               = "{arch}.pooling_type"
-        LOGIT_SCALE                = "{arch}.logit_scale"
+        VOCAB_SIZE                        = "{arch}.vocab_size"
+        CONTEXT_LENGTH                    = "{arch}.context_length"
+        EMBEDDING_LENGTH                  = "{arch}.embedding_length"
+        BLOCK_COUNT                       = "{arch}.block_count"
+        LEADING_DENSE_BLOCK_COUNT         = "{arch}.leading_dense_block_count"
+        FEED_FORWARD_LENGTH               = "{arch}.feed_forward_length"
+        EXPERT_FEED_FORWARD_LENGTH        = "{arch}.expert_feed_forward_length"
+        EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
+        USE_PARALLEL_RESIDUAL             = "{arch}.use_parallel_residual"
+        TENSOR_DATA_LAYOUT                = "{arch}.tensor_data_layout"
+        EXPERT_COUNT                      = "{arch}.expert_count"
+        EXPERT_USED_COUNT                 = "{arch}.expert_used_count"
+        EXPERT_SHARED_COUNT               = "{arch}.expert_shared_count"
+        EXPERT_WEIGHTS_SCALE              = "{arch}.expert_weights_scale"
+        POOLING_TYPE                      = "{arch}.pooling_type"
+        LOGIT_SCALE                       = "{arch}.logit_scale"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -394,6 +394,9 @@ def add_feed_forward_length(self, length: int) -> None:
     def add_expert_feed_forward_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
 
+    def add_expert_shared_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

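The new writer method is a thin wrapper: it formats the per-architecture key template from constants.py and stores the value as a uint32 in the GGUF KV section. A minimal sketch of the key it produces, assuming a "qwen2moe" architecture string and an example value:

```python
# key template added to gguf-py/gguf/constants.py in this commit
EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"

# add_expert_shared_feed_forward_length(5632) on a writer whose arch is "qwen2moe"
# would store 5632 as a uint32 under this key (arch name and value are examples)
key = EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch="qwen2moe")
print(key)  # qwen2moe.expert_shared_feed_forward_length
```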