17
17
from mergekit .common import ModelPath , ModelReference , get_config_value
18
18
from mergekit .graph import Task
19
19
20
- logger = logging .getLogger (__name__ )
20
+ LOG = logging .getLogger (__name__ )
21
21
22
22
23
23
def get_vocab_size (model_path : ModelPath , trust_remote_code : bool ) -> Optional [int ]:
@@ -30,7 +30,7 @@ def get_vocab_size(model_path: ModelPath, trust_remote_code: bool) -> Optional[i
30
30
arch_info = arch_info_for_config (cfg )
31
31
return get_config_value (cfg , arch_info .vocab_size_config_key or "vocab_size" )
32
32
except Exception as e :
33
- logger .warning (f"Unable to get vocab size for { model_path } " , exc_info = e )
33
+ LOG .warning (f"Unable to get vocab size for { model_path } " , exc_info = e )
34
34
35
35
return None
36
36
@@ -120,7 +120,7 @@ def build_union_tokenizer(
120
120
vocab = tokenizer .get_vocab ()
121
121
for tok , idx in vocab .items ():
122
122
if idx >= vocab_size :
123
- logger .warning (
123
+ LOG .warning (
124
124
f"Token { repr (tok )} present in { str (model )} tokenizer but >= vocab_size"
125
125
)
126
126
continue
@@ -138,7 +138,7 @@ def build_union_tokenizer(
138
138
139
139
if tok in out_added_tokens :
140
140
if (out_added_tokens [tok ] != info ) and tok not in warned_added_tokens :
141
- logger .warning (
141
+ LOG .warning (
142
142
f"Token '{ tok } ' added with multiple different settings, using first"
143
143
)
144
144
warned_added_tokens .add (tok )
@@ -190,7 +190,7 @@ def build_tokenizer(
190
190
)
191
191
192
192
# load all tokenizers
193
- logger .info ("Loading tokenizers" )
193
+ LOG .info ("Loading tokenizers" )
194
194
tokenizers = {base_model : tokenizer_base }
195
195
for model in referenced_models :
196
196
if model == base_model :
@@ -203,14 +203,14 @@ def build_tokenizer(
203
203
trust_remote_code = trust_remote_code ,
204
204
)
205
205
except Exception as e :
206
- logger .error (e )
207
- logger .warning (
206
+ LOG .error (e )
207
+ LOG .warning (
208
208
f"Unable to load tokenizer for { model } . Assuming same as { base_model } ."
209
209
)
210
210
continue
211
211
tokenizers [model ] = model_tok
212
212
213
- logger .info ("Building output tokenizer" )
213
+ LOG .info ("Building output tokenizer" )
214
214
# build final vocabulary
215
215
if isinstance (tokenizer_source , ModelReference ):
216
216
tokenizer_out = transformers .AutoTokenizer .from_pretrained (
@@ -233,7 +233,7 @@ def build_tokenizer(
233
233
234
234
vocab_out = tokenizer_out .get_vocab ()
235
235
236
- logger .info ("Building permutations" )
236
+ LOG .info ("Building permutations" )
237
237
permutations = {}
238
238
for model in (
239
239
pbar := tqdm .tqdm (referenced_models , desc = "Building tokenizer permutations" )
@@ -256,7 +256,7 @@ def build_tokenizer(
256
256
257
257
orig_idx = model_vocab [tok ]
258
258
if orig_idx >= vocab_size :
259
- logger .warning (
259
+ LOG .warning (
260
260
f"{ model } token { repr (tok )} has index { orig_idx } >{ vocab_size - 1 } (padding?)"
261
261
)
262
262
continue
0 commit comments