
Commit 7fd6fa8 (1 parent: 3f46282)

talk-llama : sync llama.cpp

ggml-ci

22 files changed: +4262 additions, -3549 deletions

examples/talk-llama/CMakeLists.txt
Lines changed: 3 additions & 0 deletions

@@ -17,6 +17,9 @@ if (WHISPER_SDL2)
         llama-impl.cpp
         llama-io.cpp
         llama-kv-cache.cpp
+        llama-kv-cache-unified.cpp
+        llama-kv-cache-unified-iswa.cpp
+        llama-kv-cache-recurrent.cpp
         llama-memory.cpp
         llama-mmap.cpp
         llama-model-loader.cpp

examples/talk-llama/llama-arch.cpp
Lines changed: 3 additions & 0 deletions

@@ -174,6 +174,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
     { LLM_KV_CONVNEXT_BLOCK_COUNT,      "%s.convnext.block_count"      },
 
+    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model"  },
     { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre"    },
     { LLM_KV_TOKENIZER_LIST,  "tokenizer.ggml.tokens" },

@@ -448,6 +450,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_TOKEN_TYPES,   "token_types" },
         { LLM_TENSOR_POS_EMBD,      "position_embd" },
         { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+        { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
         { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
         { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
         { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },

examples/talk-llama/llama-arch.h
Lines changed: 2 additions & 0 deletions

@@ -213,6 +213,8 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,
 
+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
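
Both llama-arch changes register the same new metadata key, LLM_KV_CLASSIFIER_OUTPUT_LABELS, whose value in LLM_KV_NAMES is a printf-style template ("%s.classifier.output_labels") that gets expanded with the architecture name when GGUF metadata is read. A minimal standalone sketch of that expansion pattern, assuming a hypothetical format_kv() helper and an example "bert" architecture name (neither is part of this diff):

#include <cstdio>
#include <string>

// Hypothetical helper (not from the diff): expand a "%s"-prefixed key
// template with an architecture name, mirroring how templates such as
// "%s.classifier.output_labels" become concrete GGUF metadata keys.
static std::string format_kv(const char * tmpl, const std::string & arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), tmpl, arch.c_str());
    return std::string(buf);
}

int main() {
    const char * tmpl = "%s.classifier.output_labels"; // from LLM_KV_NAMES above
    std::printf("%s\n", format_kv(tmpl, "bert").c_str()); // -> bert.classifier.output_labels
    return 0;
}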

examples/talk-llama/llama-batch.cpp
Lines changed: 19 additions & 12 deletions

@@ -15,24 +15,31 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
             break;
         }
     }
-    ubatch_token.resize(!has_embd ? n_ubatch : 0);
-    ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
-    ubatch_pos.resize(n_ubatch);
-    ubatch_n_seq_id.resize(n_ubatch);
-    ubatch_seq_id.resize(n_ubatch);
-    ubatch_output.resize(n_ubatch);
+
+    udatas.push_back({});
+
+    auto & udata = udatas.back();
+
+    udata.token.resize(!has_embd ? n_ubatch : 0);
+    udata.embd.resize(has_embd ? n_embd * n_ubatch : 0);
+    udata.pos.resize(n_ubatch);
+    udata.n_seq_id.resize(n_ubatch);
+    udata.seq_id.resize(n_ubatch);
+    udata.output.resize(n_ubatch);
+
     llama_ubatch ubatch = {
         /*equal_seqs   =*/ true,
         /*n_tokens     =*/ 0,
         /*n_seq_tokens =*/ 0,
         /*n_seqs       =*/ 0,
-        /*token        =*/ !has_embd ? ubatch_token.data() : nullptr,
-        /*embd         =*/ has_embd ? ubatch_embd.data() : nullptr,
-        /*pos          =*/ ubatch_pos.data(),
-        /*n_seq_id     =*/ ubatch_n_seq_id.data(),
-        /*seq_id       =*/ ubatch_seq_id.data(),
-        /*output       =*/ ubatch_output.data(),
+        /*token        =*/ !has_embd ? udata.token.data() : nullptr,
+        /*embd         =*/ has_embd ? udata.embd.data() : nullptr,
+        /*pos          =*/ udata.pos.data(),
+        /*n_seq_id     =*/ udata.n_seq_id.data(),
+        /*seq_id       =*/ udata.seq_id.data(),
+        /*output       =*/ udata.output.data(),
     };
+
     return ubatch;
 }
 
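
The llama-batch.cpp change replaces the single shared set of ubatch_* buffers with one ubatch_data entry per reserve_ubatch() call, so the pointers placed into an earlier llama_ubatch are no longer resized and overwritten by the next reservation. A minimal standalone sketch of the same pattern, using simplified stand-in types rather than the real llama.cpp structs:

#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-ins for the real llama.cpp types (illustration only).
using token_t = int32_t;

struct ubatch_view {
    token_t * token;   // points into the owning buffer held by the batcher
    size_t    n_tokens;
};

struct batcher {
    // One buffer set per reserved ubatch, mirroring llama_sbatch::udatas:
    // each reservation gets its own backing storage instead of resizing a
    // single shared set of members that the next call would overwrite.
    struct ubatch_data {
        std::vector<token_t> token;
    };
    std::vector<ubatch_data> udatas;

    ubatch_view reserve(size_t n) {
        udatas.push_back({});
        auto & udata = udatas.back();
        udata.token.resize(n);
        return { udata.token.data(), n };
    }
};

int main() {
    batcher b;
    ubatch_view u0 = b.reserve(4);
    u0.token[0] = 42;
    ubatch_view u1 = b.reserve(8);   // does not disturb u0's storage
    u1.token[0] = 7;
    std::printf("u0=%d u1=%d\n", u0.token[0], u1.token[0]); // prints u0=42 u1=7
    return 0;
}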
examples/talk-llama/llama-batch.h
Lines changed: 15 additions & 10 deletions

@@ -11,15 +11,15 @@ struct llama_ubatch {
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?
 
-    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
     uint32_t n_seq_tokens; // tokens per sequence
     uint32_t n_seqs;
 
     llama_token * token;     // [n_tokens]
     float * embd;            // [n_embd, n_tokens]
     llama_pos * pos;         // [n_tokens]
-    int32_t * n_seq_id;      // [n_seqs]
-    llama_seq_id ** seq_id;  // [n_seqs]
+    int32_t * n_seq_id;      // [n_seqs] // TODO: remove, should belong to only 1 sequence
+    llama_seq_id ** seq_id;  // [n_seqs] // TODO: become llama_seq_id * seq_id;
     int8_t * output;         // [n_tokens]
 };
 
@@ -49,13 +49,18 @@ struct llama_sbatch {
 
     const llama_batch * batch = nullptr;
 
-    // buffers for the ubatch
-    std::vector<llama_token>    ubatch_token;
-    std::vector<float>          ubatch_embd;
-    std::vector<llama_pos>      ubatch_pos;
-    std::vector<int32_t>        ubatch_n_seq_id;
-    std::vector<llama_seq_id *> ubatch_seq_id;
-    std::vector<int8_t>         ubatch_output;
+    // buffers for the ubatches
+    // TODO: very hacky, this needs a complete rework
+    struct ubatch_data {
+        std::vector<llama_token>    token;
+        std::vector<float>          embd;
+        std::vector<llama_pos>      pos;
+        std::vector<int32_t>        n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<int8_t>         output;
+    };
+
+    std::vector<ubatch_data> udatas;
 
     llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);
 
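
One detail worth noting about the new std::vector<ubatch_data> udatas member: when the outer vector grows, its ubatch_data elements are moved, and a moved std::vector transfers ownership of its heap buffer, so the data() pointers stored in previously returned llama_ubatch instances keep pointing at live storage. A tiny sketch of that general C++ guarantee (not code from this commit):

#include <cassert>
#include <utility>
#include <vector>

int main() {
    std::vector<int> inner(16, 1);
    int * p = inner.data();               // pointer into inner's heap buffer

    std::vector<std::vector<int>> outer;
    outer.push_back(std::move(inner));    // move: the heap buffer is transferred,
                                          // not copied, so p stays valid

    assert(p == outer[0].data());         // same buffer, same address
    assert(*p == 1);
    return 0;
}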