@@ -17,6 +17,7 @@
 from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer
 
+from axolotl.logging_config import configure_logging
 from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
@@ -29,8 +30,10 @@
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 
+configure_logging()
+LOG = logging.getLogger("axolotl.scripts")
+
 
-logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
 DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
 
 
@@ -212,7 +215,7 @@ def train(
 
     # load the tokenizer first
     tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
-    logging.info(f"loading tokenizer... {tokenizer_config}")
+    LOG.info(f"loading tokenizer... {tokenizer_config}")
     tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
 
     if (
@@ -234,7 +237,7 @@ def train(
         eval_dataset = None
 
     if cfg.debug or "debug" in kwargs:
-        logging.info("check_dataset_labels...")
+        LOG.info("check_dataset_labels...")
         check_dataset_labels(
             train_dataset.select(
                 [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
@@ -243,11 +246,11 @@ def train(
         )
 
     if prepare_ds_only:
-        logging.info("Finished preparing dataset. Exiting...")
+        LOG.info("Finished preparing dataset. Exiting...")
         return
 
     # Load the model and tokenizer
-    logging.info("loading model and peft_config...")
+    LOG.info("loading model and peft_config...")
     model, peft_config = load_model(
         cfg.base_model,
         cfg.base_model_config,
@@ -258,17 +261,17 @@ def train(
     )
 
     if "merge_lora" in kwargs and cfg.adapter is not None:
-        logging.info("running merge of LoRA with base model")
+        LOG.info("running merge of LoRA with base model")
         model = model.merge_and_unload()
         model.to(dtype=torch.float16)
 
         if cfg.local_rank == 0:
-            logging.info("saving merged model")
+            LOG.info("saving merged model")
             model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
         return
 
     if cfg.inference:
-        logging.info("calling do_inference function")
+        LOG.info("calling do_inference function")
         prompter: Optional[str] = "AlpacaPrompter"
         if "prompter" in kwargs:
             if kwargs["prompter"] == "None":
@@ -287,12 +290,12 @@ def train(
     model.config.use_cache = False
 
     if torch.__version__ >= "2" and sys.platform != "win32":
-        logging.info("Compiling torch model")
+        LOG.info("Compiling torch model")
         model = torch.compile(model)
 
     # go ahead and presave, so we have the adapter config available to inspect
     if peft_config:
-        logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
         peft_config.save_pretrained(cfg.output_dir)
 
     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
@@ -308,9 +311,9 @@ def terminate_handler(_, __, model):
         signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
     )
 
-    logging.info("Starting trainer...")
+    LOG.info("Starting trainer...")
     if cfg.group_by_length:
-        logging.info("hang tight... sorting dataset for group_by_length")
+        LOG.info("hang tight... sorting dataset for group_by_length")
     resume_from_checkpoint = cfg.resume_from_checkpoint
     if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
         possible_checkpoints = [
@@ -322,7 +325,7 @@ def terminate_handler(_, __, model):
                 key=lambda path: int(path.split("-")[-1]),
             )
             resume_from_checkpoint = sorted_paths[-1]
-            logging.info(
+            LOG.info(
                 f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
             )
 
@@ -336,7 +339,7 @@ def terminate_handler(_, __, model):
     else:
         trainer.train(resume_from_checkpoint=resume_from_checkpoint)
 
-    logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
+    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
 
     # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
     # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
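
Note: below is a minimal, self-contained sketch of the module-level logger pattern this diff switches to. The body of axolotl.logging_config is not shown in the diff, so the configure_logging() implementation here is an assumption for illustration; only the getLogger("axolotl.scripts") usage and the replacement of bare logging.info calls are taken from the change itself.

    # Hypothetical stand-in for axolotl.logging_config -- illustration only.
    import logging
    import logging.config
    import os


    def configure_logging():
        # Assumed behavior: a single console handler, level taken from $LOG_LEVEL,
        # installed at the root so named loggers like "axolotl.scripts" inherit it.
        logging.config.dictConfig(
            {
                "version": 1,
                "formatters": {
                    "default": {
                        "format": "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s"
                    }
                },
                "handlers": {
                    "console": {
                        "class": "logging.StreamHandler",
                        "formatter": "default",
                    }
                },
                "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
            }
        )


    configure_logging()
    LOG = logging.getLogger("axolotl.scripts")  # per-module logger, as in the diff
    LOG.info("loading tokenizer...")  # replaces the bare logging.info(...) calls

Centralizing handler and level setup in one configure_logging() call, then logging through a named logger per module, keeps formatting consistent and lets LOG_LEVEL be controlled in one place instead of each script calling logging.basicConfig.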