
Commit e16a1af

Reinstate refactor Lora adapter support

1 parent 87f3ef9

11 files changed (+942, -511 lines)

common/common.cpp (+3, -10)
@@ -686,15 +686,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--lora") {
         CHECK_ARG
         params.lora_adapter.emplace_back(argv[i], 1.0f);
-        params.use_mmap = false;
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
         const char* lora_adapter = argv[i];
         CHECK_ARG
         params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
-        params.use_mmap = false;
         return true;
     }
     if (arg == "--lora-base") {
@@ -2114,19 +2112,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                                   lora_adapter.c_str(),
-                                                   lora_scale,
-                                                   ((i > 0) || params.lora_base.empty())
-                                                       ? NULL
-                                                       : params.lora_base.c_str(),
-                                                   params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
+        llama_lora_adapter_set(lctx, adapter, lora_scale);
     }

     if (params.ignore_eos) {
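
For reference, a minimal sketch (not part of this commit) of the adapter flow the hunk above switches to: llama_lora_adapter_init() loads a LoRA file once per model, and llama_lora_adapter_set() attaches it to a context at a given scale, replacing the old llama_model_apply_lora_from_file() call. The helper name and error handling below are illustrative assumptions, not repository code.

// example_attach_lora is a hypothetical helper; only the two llama_lora_adapter_* calls
// come from the diff above.
#include "llama.h"
#include <cstdio>

static bool example_attach_lora(struct llama_model * model, struct llama_context * ctx,
                                const char * lora_path, float scale) {
    // load the adapter from a GGUF LoRA file; it is released together with the model
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, lora_path);
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter from '%s'\n", lora_path);
        return false;
    }
    // apply it to this context at the requested scale (e.g. the --lora-scaled value)
    llama_lora_adapter_set(ctx, adapter, scale);
    return true;
}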

convert_hf_to_gguf.py (+12, -22)
@@ -2342,13 +2342,6 @@ def set_vocab(self):

         special_vocab.add_to_gguf(self.gguf_writer)

-    def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
-        if n_head_kv is not None and n_head != n_head_kv:
-            n_head = n_head_kv
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
     def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
@@ -2367,26 +2360,22 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-        hidden_size = self.hparams["hidden_size"]
+        n_embd = self.hparams["hidden_size"]
         q_per_kv = num_heads // num_kv_heads
-        head_dim = hidden_size // num_heads
+        head_dim = n_embd // num_heads
         num_groups = num_heads // q_per_kv

-        qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
-
-        if re.match(qkv_pattern, name):
-            bid = re.findall(qkv_pattern, name)[0]
+        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
             qkv = data_torch
-            # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
-            qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
-            q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
+
+            qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
+            q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
+
             # The model weights of q and k equire additional reshape.
-            # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
-            q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
-            # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
-            k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
-            # v = rearrange(v, " o g n i -> o (g n i)").T
-            v = v.reshape((v.shape[0], -1)).T
+            q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
+            k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
+            v = v.reshape((-1, v.shape[-1]))
+
             return [
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
@@ -3695,6 +3684,7 @@ def main() -> None:
                                     small_first_shard=args.no_tensor_first_split)

         logger.info("Set model parameters")
+        model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL)
         model_instance.set_gguf_parameters()

         logger.info("Set model tokenizer")
