Skip to content

Commit b088a17

Browse files
committed
Whoops missed one
1 parent 8ecf153 commit b088a17

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

mergekit/tokenizer/build.py

+10 -10
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
from mergekit.common import ModelPath, ModelReference, get_config_value
1818
from mergekit.graph import Task
1919

20-
logger = logging.getLogger(__name__)
20+
LOG = logging.getLogger(__name__)
2121

2222

2323
def get_vocab_size(model_path: ModelPath, trust_remote_code: bool) -> Optional[int]:
@@ -30,7 +30,7 @@ def get_vocab_size(model_path: ModelPath, trust_remote_code: bool) -> Optional[i
3030
arch_info = arch_info_for_config(cfg)
3131
return get_config_value(cfg, arch_info.vocab_size_config_key or "vocab_size")
3232
except Exception as e:
33-
logger.warning(f"Unable to get vocab size for {model_path}", exc_info=e)
33+
LOG.warning(f"Unable to get vocab size for {model_path}", exc_info=e)
3434

3535
return None
3636

@@ -120,7 +120,7 @@ def build_union_tokenizer(
120120
vocab = tokenizer.get_vocab()
121121
for tok, idx in vocab.items():
122122
if idx >= vocab_size:
123-
logger.warning(
123+
LOG.warning(
124124
f"Token {repr(tok)} present in {str(model)} tokenizer but >= vocab_size"
125125
)
126126
continue
@@ -138,7 +138,7 @@ def build_union_tokenizer(
138138

139139
if tok in out_added_tokens:
140140
if (out_added_tokens[tok] != info) and tok not in warned_added_tokens:
141-
logger.warning(
141+
LOG.warning(
142142
f"Token '{tok}' added with multiple different settings, using first"
143143
)
144144
warned_added_tokens.add(tok)
@@ -190,7 +190,7 @@ def build_tokenizer(
190190
)
191191

192192
# load all tokenizers
193-
logger.info("Loading tokenizers")
193+
LOG.info("Loading tokenizers")
194194
tokenizers = {base_model: tokenizer_base}
195195
for model in referenced_models:
196196
if model == base_model:
@@ -203,14 +203,14 @@ def build_tokenizer(
203203
trust_remote_code=trust_remote_code,
204204
)
205205
except Exception as e:
206-
logger.error(e)
207-
logger.warning(
206+
LOG.error(e)
207+
LOG.warning(
208208
f"Unable to load tokenizer for {model}. Assuming same as {base_model}."
209209
)
210210
continue
211211
tokenizers[model] = model_tok
212212

213-
logger.info("Building output tokenizer")
213+
LOG.info("Building output tokenizer")
214214
# build final vocabulary
215215
if isinstance(tokenizer_source, ModelReference):
216216
tokenizer_out = transformers.AutoTokenizer.from_pretrained(
@@ -233,7 +233,7 @@ def build_tokenizer(
233233

234234
vocab_out = tokenizer_out.get_vocab()
235235

236-
logger.info("Building permutations")
236+
LOG.info("Building permutations")
237237
permutations = {}
238238
for model in (
239239
pbar := tqdm.tqdm(referenced_models, desc="Building tokenizer permutations")
@@ -256,7 +256,7 @@ def build_tokenizer(
256256

257257
orig_idx = model_vocab[tok]
258258
if orig_idx >= vocab_size:
259-
logger.warning(
259+
LOG.warning(
260260
f"{model} token {repr(tok)} has index {orig_idx}>{vocab_size-1} (padding?)"
261261
)
262262
continue

0 commit comments

Comments
 (0)