
Commit 6f16c45

Merge pull request axolotl-ai-cloud#276 from theobjectivedad/logging_enhancement
Logging update: added PID and formatting
2 parents 0bd09c0 + b1f4f7a commit 6f16c45

14 files changed: +124 -79 lines

scripts/alpaca_json_to_jsonl.py (+3)

@@ -15,6 +15,9 @@
     JsonToJsonlConverter,
     StdoutWriter,
 )
+from axolotl.logging_config import configure_logging
+
+configure_logging()
 
 # add src to the pythonpath so we don't need to pip install this
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

scripts/finetune.py (+17 -14)

@@ -17,6 +17,7 @@
 from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer
 
+from axolotl.logging_config import configure_logging
 from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer

@@ -29,8 +30,10 @@
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 
+configure_logging()
+LOG = logging.getLogger("axolotl.scripts")
+
 
-logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
 DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
 
 

@@ -212,7 +215,7 @@ def train(
 
     # load the tokenizer first
     tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
-    logging.info(f"loading tokenizer... {tokenizer_config}")
+    LOG.info(f"loading tokenizer... {tokenizer_config}")
     tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
 
     if (

@@ -234,7 +237,7 @@ def train(
         eval_dataset = None
 
     if cfg.debug or "debug" in kwargs:
-        logging.info("check_dataset_labels...")
+        LOG.info("check_dataset_labels...")
         check_dataset_labels(
             train_dataset.select(
                 [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec

@@ -243,11 +246,11 @@
         )
 
     if prepare_ds_only:
-        logging.info("Finished preparing dataset. Exiting...")
+        LOG.info("Finished preparing dataset. Exiting...")
         return
 
     # Load the model and tokenizer
-    logging.info("loading model and peft_config...")
+    LOG.info("loading model and peft_config...")
     model, peft_config = load_model(
         cfg.base_model,
         cfg.base_model_config,

@@ -258,17 +261,17 @@ def train(
     )
 
     if "merge_lora" in kwargs and cfg.adapter is not None:
-        logging.info("running merge of LoRA with base model")
+        LOG.info("running merge of LoRA with base model")
         model = model.merge_and_unload()
         model.to(dtype=torch.float16)
 
         if cfg.local_rank == 0:
-            logging.info("saving merged model")
+            LOG.info("saving merged model")
             model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
         return
 
     if cfg.inference:
-        logging.info("calling do_inference function")
+        LOG.info("calling do_inference function")
         prompter: Optional[str] = "AlpacaPrompter"
         if "prompter" in kwargs:
             if kwargs["prompter"] == "None":

@@ -287,12 +290,12 @@ def train(
     model.config.use_cache = False
 
     if torch.__version__ >= "2" and sys.platform != "win32":
-        logging.info("Compiling torch model")
+        LOG.info("Compiling torch model")
         model = torch.compile(model)
 
     # go ahead and presave, so we have the adapter config available to inspect
     if peft_config:
-        logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
         peft_config.save_pretrained(cfg.output_dir)
 
     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model

@@ -308,9 +311,9 @@ def terminate_handler(_, __, model):
         signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
     )
 
-    logging.info("Starting trainer...")
+    LOG.info("Starting trainer...")
     if cfg.group_by_length:
-        logging.info("hang tight... sorting dataset for group_by_length")
+        LOG.info("hang tight... sorting dataset for group_by_length")
     resume_from_checkpoint = cfg.resume_from_checkpoint
     if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
         possible_checkpoints = [

@@ -322,7 +325,7 @@ def terminate_handler(_, __, model):
                 key=lambda path: int(path.split("-")[-1]),
             )
             resume_from_checkpoint = sorted_paths[-1]
-            logging.info(
+            LOG.info(
                 f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
             )
 

@@ -336,7 +339,7 @@ def terminate_handler(_, __, model):
     else:
         trainer.train(resume_from_checkpoint=resume_from_checkpoint)
 
-    logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
+    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
 
     # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
     # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
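All of the scripts/finetune.py hunks follow the same shape: drop the ad-hoc logging.basicConfig(...) call, run configure_logging() once at import time, and route messages through a named child logger instead of the root logger. A minimal sketch of that pattern in a standalone script; the axolotl.my_script logger name and main() function are illustrative, not part of this commit:

import logging
import os

from axolotl.logging_config import configure_logging

configure_logging()  # installs the dictConfig-based console handler on the root logger
LOG = logging.getLogger("axolotl.my_script")  # hypothetical child logger for illustration


def main() -> None:
    # LOG_LEVEL is still honored, but through the root config in logging_config.py
    # rather than a per-script logging.basicConfig call
    LOG.info("effective level: %s", os.getenv("LOG_LEVEL", "INFO"))


if __name__ == "__main__":
    main()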

src/axolotl/datasets.py (+3 -1)

@@ -14,6 +14,8 @@
 # let's check to ensure we don't truncate an item in the middle, we'll use
 # the collators later on to pad the datasets
 
+LOG = logging.getLogger("axolotl")
+
 
 class TokenizedPromptDataset(IterableDataset):
     """

@@ -115,7 +117,7 @@ def __iter__(self):
                             "attention_mask": attention_mask,
                         }
                    else:
-                        logging.warning(
+                        LOG.warning(
                            f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
                        )
                    buffer = {

src/axolotl/logging_config.py (+30)

@@ -0,0 +1,30 @@
+"""Logging configuration settings"""
+
+import os
+import sys
+from logging.config import dictConfig
+from typing import Any, Dict
+
+DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
+    "version": 1,
+    "formatters": {
+        "simple": {
+            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
+        },
+    },
+    "filters": {},
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "simple",
+            "filters": [],
+            "stream": sys.stdout,
+        },
+    },
+    "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
+}
+
+
+def configure_logging():
+    """Configure with default logging"""
+    dictConfig(DEFAULT_LOGGING_CONFIG)
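Since configure_logging() applies DEFAULT_LOGGING_CONFIG to the root logger, the quickest way to see the new format (including the PID field) is to emit a record from any child logger. A small sketch, assuming the axolotl package is importable; the logger name, timestamp, and PID below are illustrative placeholders:

import logging

from axolotl.logging_config import configure_logging

configure_logging()  # dictConfig: "simple" formatter + stdout StreamHandler on the root logger
LOG = logging.getLogger("axolotl.demo")  # hypothetical child logger name

LOG.info("prepared dataset saved")
# Expected shape of the stdout line (values are placeholders):
# [2023-07-09 10:15:30,123] [INFO] [axolotl.demo.<module>:8] [PID:12345] prepared dataset saved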

src/axolotl/monkeypatch/llama_landmark_attn.py (+2 -2)

@@ -53,7 +53,7 @@
     replace_return_docstrings,
 )
 
-logger = logging.get_logger(__name__)
+LOG = logging.getLogger("axolotl")
 
 _CONFIG_FOR_DOC = "LlamaConfig"
 

@@ -862,7 +862,7 @@ def forward(
 
         if self.gradient_checkpointing and self.training:
            if use_cache:
-                logger.warning_once(
+                LOG.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

src/axolotl/prompt_strategies/pygmalion.py (+3 -1)

@@ -11,6 +11,8 @@
     tokenize_prompt_default,
 )
 
+LOG = logging.getLogger("axolotl")
+
 IGNORE_TOKEN_ID = -100
 
 

@@ -64,7 +66,7 @@ def tokenize_prompt(self, prompt):
                     *copy.deepcopy(res["input_ids"])
                 ][len(self.bot_prefix_token_ids) :]
             else:
-                logging.warning(f"unknown role in conversation: {role}")
+                LOG.warning(f"unknown role in conversation: {role}")
                 res = defaultdict(lambda: [])
 
             # pylint: disable=duplicate-code

src/axolotl/prompt_tokenizers.py (+3 -1)

@@ -10,6 +10,8 @@
 
 from axolotl.prompters import IGNORE_TOKEN_ID
 
+LOG = logging.getLogger("axolotl")
+
 IGNORE_INDEX = -100
 LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
 LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec

@@ -384,7 +386,7 @@ def tokenize_prompt(self, prompt):
                    # everything from this is masked out from the labels
                    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                else:
-                    logging.warning(f"unhandled role: {part[0]}")
+                    LOG.warning(f"unhandled role: {part[0]}")
 
                # pylint: disable=duplicate-code
                result, current_len = parse_tokenized_to_result(

src/axolotl/prompters.py (+2 -1)

@@ -5,6 +5,7 @@
 from enum import Enum, auto
 from typing import Generator, List, Optional, Tuple, Union
 
+LOG = logging.getLogger("axolotl")
 IGNORE_TOKEN_ID = -100
 
 

@@ -241,7 +242,7 @@ def get_prompt(self) -> Generator[Tuple[str, str], None, None]:
            if message:
                yield (role + ":", " " + message)
            else:
-                logging.warning(f"role with empty message: {role}")
+                LOG.warning(f"role with empty message: {role}")
                yield (role + ":", "")
 
    def copy(self):

src/axolotl/utils/data.py (+20 -24)

@@ -35,6 +35,8 @@
     SummarizeTLDRPrompter,
 )
 
+LOG = logging.getLogger("axolotl")
+
 
 def load_tokenized_prepared_datasets(
     tokenizer, cfg, default_dataset_prepared_path

@@ -73,17 +75,17 @@ def load_tokenized_prepared_datasets(
     if dataset:
         ...
     elif any(prepared_ds_path.glob("*")):
-        logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
         dataset = load_from_disk(str(prepared_ds_path))
-        logging.info("Prepared dataset loaded from disk...")
+        LOG.info("Prepared dataset loaded from disk...")
     else:
-        logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
-        logging.info("Loading raw datasets...")
+        LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
+        LOG.info("Loading raw datasets...")
 
         if cfg.seed:
             seed = cfg.seed
         else:
-            logging.info("No seed provided, using default seed of 42")
+            LOG.info("No seed provided, using default seed of 42")
             seed = 42
 
         datasets = []

@@ -255,25 +257,21 @@ def load_tokenized_prepared_datasets(
                 suffix = ""
                 if ":load_" in d.type:
                     suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
-                logging.error(
-                    f"unhandled prompt tokenization strategy: {d.type}. {suffix}"
-                )
+                LOG.error(f"unhandled prompt tokenization strategy: {d.type}. {suffix}")
                 raise ValueError(
                     f"unhandled prompt tokenization strategy: {d.type} {suffix}"
                 )
-        logging.info("tokenizing, merging, and shuffling master dataset")
+        LOG.info("tokenizing, merging, and shuffling master dataset")
 
         samples: List[int] = []
         for d in datasets:
             samples = samples + list(d)
         dataset = Dataset.from_list(samples).shuffle(seed=seed)
         if cfg.local_rank == 0:
-            logging.info(
-                f"Saving merged prepared dataset to disk... {prepared_ds_path}"
-            )
+            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
             dataset.save_to_disk(prepared_ds_path)
             if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                     f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                 )
                 dataset.push_to_hub(

@@ -324,7 +322,7 @@ def load_prepare_datasets(
         use_auth_token = cfg.hf_use_auth_token
         try:
             if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                     f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                 )
                 dataset = load_dataset(

@@ -338,13 +336,13 @@ def load_prepare_datasets(
         if dataset:
             ...
         elif any(prepared_ds_path.glob("*")):
-            logging.info(
+            LOG.info(
                 f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
             )
             dataset = load_from_disk(str(prepared_ds_path))
-            logging.info("Prepared packed dataset loaded from disk...")
+            LOG.info("Prepared packed dataset loaded from disk...")
             if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                     f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                 )
                 dataset.push_to_hub(

@@ -363,9 +361,7 @@ def load_prepare_datasets(
                 [dataset],
                 seq_length=max_packed_sequence_len,
             )
-            logging.info(
-                f"packing master dataset to len: {cfg.max_packed_sequence_len}"
-            )
+            LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
             dataset = Dataset.from_list(list(constant_len_dataset))
 
             # filter out bad data

@@ -381,12 +377,12 @@ def load_prepare_datasets(
             )
 
             if cfg.local_rank == 0:
-                logging.info(
+                LOG.info(
                     f"Saving packed prepared dataset to disk... {prepared_ds_path}"
                 )
                 dataset.save_to_disk(prepared_ds_path)
                 if cfg.push_dataset_to_hub:
-                    logging.info(
+                    LOG.info(
                         f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                     )
                     dataset.push_to_hub(

@@ -399,7 +395,7 @@ def load_prepare_datasets(
                 )
 
     if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
-        logging.info(
+        LOG.info(
             f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
         )
         dataset = dataset.shard(

@@ -520,7 +516,7 @@ def encode_pretraining(tokenizer, max_tokens, examples):
         "attention_mask": [seq.tolist() for seq in new_attention_mask],
     }
 
-    logging.debug(len(ret["input_ids"]))
+    LOG.debug(len(ret["input_ids"]))
     return ret
 
 