@@ -37,6 +37,8 @@ int msB_log256(int x)
 const int block_header_size = 2;
 const int fixed_token_cost = 1;
 
+int total_pad = 0;
+
 std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gpt_sampler *smpl, int num_raw_tokens_header)
 {
 
@@ -62,7 +64,6 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
     for (int index = num_raw_tokens_header; index < inp.size(); index++)
     {
         auto &cur_p = smpl->cur_p; // initialized by set_logits
-        // llama_sampler_apply(smpl->grmr, &cur_p);
         llama_sampler_apply(smpl->chain, &cur_p);
 
         int match = -1;
@@ -121,12 +122,10 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int sample_id = sample_ids[i];
         uint8_t PAD = (8 - bit_offset % 8) % 8;
         uint8_t bytesize = (uint8_t)msB_log256(sample_id);
-        // LOG("pos: %d, bs: %d\n", sample_id, bytesize);
 
         // Big number, better save as token
         if (sample_id > PAD + (block_header_size + fixed_token_cost + bytesize) * 8)
         {
-            // LOG("End block\n");
             // Close current block (0b1010 is block marker)
             if (was_block)
             {
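Aside, not part of the patch: the escape condition above weighs the cost of keeping a large rank inside the current bit block against writing it out as a literal token record. A minimal sketch of that predicate, assuming a rank of `sample_id` costs roughly `sample_id` bits when kept in the block (my reading of the comparison, not confirmed by the source); the helper name `should_escape` is invented here:

```cpp
#include <cstdint>

// Sketch: escaping costs the padding bits needed to realign to a byte
// boundary, plus a 2-byte block header, a 1-byte token header and the
// payload bytes, all paid in bits (constants as in the patch).
static bool should_escape(int sample_id, uint8_t pad_bits, uint8_t bytesize) {
    const int block_header_size = 2;
    const int fixed_token_cost  = 1;
    return sample_id > pad_bits + (block_header_size + fixed_token_cost + bytesize) * 8;
}
```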
@@ -151,21 +150,18 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
                 }
             }
             bit_offset += PAD;
+            total_pad += PAD;
             if (bit_offset % 8)
             {
                 LOG_ERR("Unreachable");
                 exit(-1);
             }
-            // LOG("\n%d", bit_offset/8);
             // 0b0101 is token marker
-
             sample_ids_bitpacked.push_back(0b01010000 | bytesize);
             // put token bytes into sample_ids_bitpacked
-            // LOG("\n%d -> ", sample_id);
             for (int j = 0; j < bytesize; j++)
             {
                 sample_ids_bitpacked.push_back(sample_id & 0xff);
-                LOG(" %02x ", sample_id & 0xff);
                 sample_id >>= 8;
             }
             if (sample_id)
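Aside, not part of the patch: the bytes emitted by this hunk form a small self-describing record. A standalone sketch of the same layout, with the helper name `pack_token_record` invented here for illustration:

```cpp
#include <cstdint>
#include <vector>

// Sketch of the escaped-token record as emitted above: one header byte with
// the 0b0101 marker in the high nibble and the payload byte count in the low
// nibble, then the sample id written least-significant byte first.
static std::vector<uint8_t> pack_token_record(int sample_id, uint8_t bytesize) {
    std::vector<uint8_t> rec;
    rec.push_back(0b01010000 | bytesize);
    for (int j = 0; j < bytesize; j++) {
        rec.push_back(sample_id & 0xff);
        sample_id >>= 8;
    }
    return rec;
}
```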
@@ -217,6 +213,7 @@ std::vector<uint8_t> encode(llama_context *ctx, std::vector<llama_token> inp, gp
         int block_size = (bit_offset + PAD) / 8 - block_start;
         // endianness: big endian
         sample_ids_bitpacked[block_start + 1] = block_size & 0xff;
+        total_pad += PAD;
     }
     llama_batch_free(batch);
     return sample_ids_bitpacked;
@@ -245,7 +242,6 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
         auto token_str = llama_token_to_piece(ctx, token);
         LOG("%s", token_str.c_str());
     }
-    LOG("\u001b[0m\u001b[37m");
     if (llama_decode(ctx, batch))
     {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -275,6 +271,7 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
 
         auto &cur_p = smpl->cur_p; // initialized by set_logits
         llama_sampler_apply(smpl->chain, &cur_p);
+
         auto token_id = cur_p.data[sample_id].id;
 
         out.push_back(token_id);
@@ -288,12 +285,10 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
             // print in red
             LOG("\u001b[31m%s", llama_token_to_piece(ctx, token_id).c_str());
             LOG("\nExpected: %s", llama_token_to_piece(ctx, inp[num_raw_tokens_header + index]).c_str());
-            // LOG("\n%d", num_raw_tokens_header + index);
             LOG("\n, Id: %d != %d", token_id, inp[num_raw_tokens_header + index]);
             LOG("\nPos: %d, bs:%d", sample_id, bytesize);
 
             // print sample_id bytes in hex
-            // LOG("\n %02x %02x", sample_ids_bitpacked[bit_index / 8], sample_ids_bitpacked[bit_index / 8 + 1]);
             LOG("\n");
             for (int i = bytesize; i > 0; i--)
             {
@@ -335,8 +330,8 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
             int sample_id = id;
 
             auto &cur_p = smpl->cur_p; // initialized by set_logits
-            // llama_sampler_apply(smpl->grmr, &cur_p);
             llama_sampler_apply(smpl->chain, &cur_p);
+
             auto token_id = cur_p.data[sample_id].id;
             out.push_back(token_id);
             if (!inp.size() || token_id == inp[num_raw_tokens_header + index])
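Aside, not part of the patch: this hunk is the heart of the scheme. Encoder and decoder run the same model and the same sampler chain, so they see the identical sorted candidate list; the stored `sample_id` is just a rank into that list. Schematically, using the `llama_token_data_array` type from llama.h and helper names invented here:

```cpp
#include "llama.h"

// Encoder side: find the rank of the true next token in the sampler's
// sorted candidate list. Decoder side: index the same list by that rank.
static int rank_of(const llama_token_data_array &cur_p, llama_token truth) {
    for (size_t i = 0; i < cur_p.size; i++)
        if (cur_p.data[i].id == truth) return (int)i;
    return -1; // not among the candidates
}

static llama_token token_at(const llama_token_data_array &cur_p, int rank) {
    return cur_p.data[rank].id;
}
```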
@@ -363,7 +358,6 @@ std::vector<llama_token> decode(llama_context *ctx, gpt_sampler *smpl, std::vect
                     id = 0;
                 }
             }
-            // LOG("\n(%d+%d)/8= %d\n", bit_index, PAD, (bit_index+PAD)/8);
             bit_index += PAD;
         }
     }
@@ -554,10 +548,12 @@ int main(int argc, char **argv)
     if (!params.no_perf){
         LOG("\nInput: %d characters (%d tokens)", params.prompt.length(), inp.size());
 
-        float compressed_byte_per_token = (float)sample_ids_bitpacked.size() / (float)inp.size();
+        float compressed_bits_per_token = 8 * (float)sample_ids_bitpacked.size() / (float)inp.size();
         float compressed_bits_per_char = 8 * (float)sample_ids_bitpacked.size() / (float)params.prompt.length();
 
-        LOG("\n%d compressed bytes,(%04f bytes per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_byte_per_token, compressed_bits_per_char);
+        LOG("\n%d compressed bytes,(%04f bits per token, %04f bits per character)\n", (int)sample_ids_bitpacked.size(), compressed_bits_per_token, compressed_bits_per_char);
+        LOG("\n%d padding bits, (%04f bits per character without padding)", total_pad, compressed_bits_per_char - total_pad/(float)params.prompt.length());
+        LOG("\nPPL (over)estimation: %04f (%04f with padding)", exp2(compressed_bits_per_token - total_pad/(float)inp.size()), exp2(compressed_bits_per_token));
     }
     // maybe this needs to be changed
     if (params.out_file != "imatrix.dat"){
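Aside, not part of the patch: the new PPL line follows from the Shannon bound. An ideal coder spends about -log2 p(token) bits per token, so 2^(bits per token) upper-bounds the model's perplexity on the compressed text; padding is container overhead rather than model cost, so subtracting it gives the tighter estimate. A standalone sketch with made-up numbers:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical values, for illustration only.
    int compressed_bytes = 150; // sample_ids_bitpacked.size()
    int num_tokens       = 400; // inp.size()
    int total_pad        = 96;  // padding bits counted by the patch

    float bits_per_token = 8.0f * compressed_bytes / num_tokens;
    printf("PPL (over)estimation: %f (%f with padding)\n",
           exp2f(bits_per_token - (float)total_pad / num_tokens),
           exp2f(bits_per_token));
    return 0;
}
```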
@@ -630,7 +626,7 @@ int main(int argc, char **argv)
         ofs.write((char *)&out_str[0], out_str.size());
         ofs.close();
     }
-
+
     llama_free(ctx);
     llama_free_model(model);