
Commit 0f35871

swap tiny models that have safetensors for some CI tests (#2641)
1 parent 25e6c5f commit 0f35871

14 files changed: +137 -20 lines

.github/workflows/tests-nightly.yml (+87)

@@ -18,9 +18,96 @@ jobs:
     env:
       SKIP: no-commit-to-branch
 
+  preload-cache:
+    name: Preload HF cache
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
+    needs: [preload-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
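
Note: the new preload-cache job warms a shared Hugging Face hub cache (actions/cache/restore@v4 before the work, actions/cache/save@v4 after), and the pytest job now declares needs: [preload-cache] so the test matrix starts with models and datasets already on disk. For reference, a minimal Python sketch of what the huggingface-cli download step does, using huggingface_hub's snapshot_download, which writes to the same ~/.cache/huggingface/hub path the workflow persists:

from huggingface_hub import snapshot_download

# Populates ~/.cache/huggingface/hub, the directory the workflow caches.
snapshot_download(
    repo_id="axolotl-ai-internal/axolotl-oss-dataset-fixtures",
    repo_type="dataset",
)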

requirements.txt (+1)

@@ -11,6 +11,7 @@ liger-kernel==0.5.9
 
 packaging==23.2
 
+huggingface_hub==0.31.0
 peft==0.15.2
 transformers==4.51.3
 tokenizers>=0.21.1

src/axolotl/train.py (+2 -3)

@@ -2,6 +2,7 @@
 
 import importlib
 import inspect
+import logging
 import os
 import signal
 import sys
@@ -12,7 +13,6 @@
 
 import torch
 import transformers.modelcard
-from accelerate.logging import get_logger
 from accelerate.utils import save_fsdp_model
 from datasets import Dataset
 from huggingface_hub.errors import OfflineModeIsEnabled
@@ -42,7 +42,7 @@
 except ImportError:
     BetterTransformer = None
 
-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)
 
 
 def setup_model_and_tokenizer(
@@ -63,7 +63,6 @@ def setup_model_and_tokenizer(
     # Load tokenizer
     LOG.debug(
         f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
-        main_process_only=True,
     )
     tokenizer = load_tokenizer(cfg)
 
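Note: swapping accelerate.logging.get_logger for the stdlib logger is what forces the removal of main_process_only=True from the LOG.debug call: accelerate returns a MultiProcessAdapter whose log methods accept that keyword, while logging.Logger does not. A minimal sketch of the incompatibility, assuming accelerate's documented adapter behavior:

import logging

# Before: accelerate's get_logger(__name__) returns a MultiProcessAdapter, so
#     LOG.debug(msg, main_process_only=True)
# is valid (given an initialized accelerate state).
# After: the stdlib logger has no such keyword, hence the kwarg's removal.
LOG = logging.getLogger(__name__)
LOG.debug("loading tokenizer...")  # passing main_process_only= here would raise TypeError
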
src/axolotl/utils/gradient_checkpointing/__init__.py (+21)

@@ -1,15 +1,36 @@
 """custom checkpointing utils"""
 
+import importlib
 from functools import partial
 
+from packaging import version
+
 from axolotl.utils.gradient_checkpointing.unsloth import (
     Unsloth_Offloaded_Gradient_Checkpointer,
 )
 
+transformers_version = version.parse(importlib.metadata.version("transformers"))
+if transformers_version > version.parse("4.51.3"):
+    from transformers.modeling_layers import GradientCheckpointingLayer
+
+    def uses_gc_layers(decoder_layer):
+        return isinstance(decoder_layer.func.__self__, GradientCheckpointingLayer)
+
+else:
+
+    def uses_gc_layers(_):
+        return False
+
 
 def hf_grad_checkpoint_offload_wrapper(
     decoder_layer, *args, use_reentrant=None
 ): # pylint: disable=unused-argument
+    if uses_gc_layers(decoder_layer):
+        return Unsloth_Offloaded_Gradient_Checkpointer.apply(
+            decoder_layer,
+            *args,
+        )
+
     return Unsloth_Offloaded_Gradient_Checkpointer.apply(
         (
             decoder_layer.func.__self__
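
Note: the new uses_gc_layers gate exists because transformers releases after 4.51.3 wrap decoder layers in GradientCheckpointingLayer; when that wrapper is detected, the layer callable is handed to the checkpointer as-is instead of being unwrapped. As the unwrapping code implies, decoder_layer arrives as a functools.partial over a bound method, with the layer instance at .func.__self__. A self-contained sketch of that shape (FakeLayer is hypothetical, for illustration only):

from functools import partial

class FakeLayer:
    def forward(self, hidden_states):
        return hidden_states

layer = FakeLayer()
decoder_layer = partial(layer.forward)

# The instance the wrapper inspects lives on the partial's bound method:
assert decoder_layer.func.__self__ is layer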

tests/e2e/multigpu/test_llama.py (+1 -1)

@@ -479,7 +479,7 @@ def test_fsdp2_packed(
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.1,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },

tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py (+6 -4)

@@ -29,12 +29,12 @@
 
 MODEL_CONFIGS = [
     {
-        "name": "openaccess-ai-collective/tiny-mistral",
+        "name": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
         "expected_activation": apply_lora_mlp_swiglu,
         "dtype": torch.float16,
     },
     {
-        "name": "Qwen/Qwen2-7B",
+        "name": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
         "expected_activation": apply_lora_mlp_swiglu,
         "dtype": torch.float16,
     },
@@ -44,7 +44,7 @@
         "dtype": torch.float32,
     },
     {
-        "name": "mhenrichsen/gemma-2b",
+        "name": "trl-internal-testing/tiny-Gemma2ForCausalLM",
         "expected_activation": apply_lora_mlp_geglu,
         "dtype": torch.float16,
     },
@@ -156,7 +156,9 @@ def test_swiglu_mlp_integration(small_llama_model):
 def test_geglu_model_integration():
     """Test GeGLU activation with Gemma model."""
     model = AutoModelForCausalLM.from_pretrained(
-        "mhenrichsen/gemma-2b", torch_dtype=torch.float16, device_map="cuda:0"
+        "trl-internal-testing/tiny-Gemma2ForCausalLM",
+        torch_dtype=torch.float16,
+        device_map="cuda:0",
     )
     peft_config = get_peft_config(
         {
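
Note: every checkpoint swapped in here (and in the Mistral and model-patch tests below) is a trl-internal-testing tiny model that ships safetensors weights, which is the point of the commit. A sketch for verifying that property on the hub; has_safetensors is a hypothetical helper, not part of this commit:

from huggingface_hub import list_repo_files

def has_safetensors(repo_id: str) -> bool:
    # True when the repo contains at least one .safetensors weight file.
    return any(f.endswith(".safetensors") for f in list_repo_files(repo_id))

print(has_safetensors("trl-internal-testing/tiny-MistralForCausalLM-0.2"))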

tests/e2e/patched/test_falcon_samplepack.py (+4)

@@ -6,6 +6,8 @@
 import os
 import unittest
 
+import pytest
+
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
@@ -23,6 +25,7 @@ class TestFalconPatched(unittest.TestCase):
     Test case for Falcon models
     """
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_qlora(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -71,6 +74,7 @@ def test_qlora(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_ft(self, temp_dir):
         # pylint: disable=duplicate-code

tests/e2e/patched/test_mistral_samplepack.py (+2 -2)

@@ -28,7 +28,7 @@ def test_lora_packing(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 1024,
@@ -76,7 +76,7 @@ def test_ft_packing(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 1024,

tests/e2e/patched/test_model_patches.py (+1 -1)

@@ -56,7 +56,7 @@ def test_mixtral_multipack(self, temp_dir):
     def test_mistral_multipack(self, temp_dir):
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 2048,

tests/e2e/patched/test_resume.py (+3 -1)

@@ -15,7 +15,7 @@
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault
 
-from ..utils import check_model_output_exists, most_recent_subdir
+from ..utils import check_model_output_exists, most_recent_subdir, require_torch_2_6_0
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -26,6 +26,7 @@ class TestResumeLlama:
     Test case for resuming training of llama models
     """
 
+    @require_torch_2_6_0
     def test_resume_lora_packed(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -62,6 +63,7 @@ def test_resume_lora_packed(self, temp_dir):
                 "save_total_limit": 5,
                 "max_steps": 15,
                 "use_tensorboard": True,
+                "save_safetensors": True,
             }
         )
         if is_torch_bf16_gpu_available():
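
Note: require_torch_2_6_0 comes from tests/e2e/utils, whose body is not part of this diff. A plausible implementation, offered only as a sketch (the real helper may differ):

import pytest
import torch
from packaging import version

# Skip the test unless the installed torch is at least 2.6.0.
require_torch_2_6_0 = pytest.mark.skipif(
    version.parse(torch.__version__) < version.parse("2.6.0"),
    reason="requires torch>=2.6.0",
)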

tests/e2e/test_evaluate.py (+2 -5)

@@ -19,14 +19,11 @@ def test_evaluate(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sequence_len": 1024,
                 "val_set_size": 0.02,
                 "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
+                    "pad_token": "<|endoftext|>",
                 },
                 "datasets": [
                     {
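
Note: the special-token overrides shrink because HuggingFaceTB/SmolLM2-135M uses a GPT-2-style tokenizer rather than a Llama one, so the <unk>/<s>/</s> entries no longer apply and only a pad token needs to be supplied. A quick check, as a sketch:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
print(tok.eos_token)  # expected: <|endoftext|>, reused above as the pad token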

tests/e2e/test_falcon.py (+5)

@@ -6,6 +6,8 @@
 import os
 import unittest
 
+import pytest
+
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
@@ -23,6 +25,7 @@ class TestFalcon(unittest.TestCase):
     Test case for falcon
     """
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_lora(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -74,6 +77,7 @@ def test_lora(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_lora_added_vocab(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -129,6 +133,7 @@ def test_lora_added_vocab(self, temp_dir):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_ft(self, temp_dir):
         # pylint: disable=duplicate-code

tests/e2e/test_mistral.py (+2 -2)

@@ -30,7 +30,7 @@ def test_lora(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sequence_len": 1024,
                 "load_in_8bit": True,
@@ -77,7 +77,7 @@ def test_ft(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sequence_len": 1024,
                 "val_set_size": 0.02,

tests/test_datasets.py (-1)

@@ -414,7 +414,6 @@ def test_loading_local_dataset_folder(self, tokenizer):
         snapshot_path = snapshot_download(
             repo_id="mhenrichsen/alpaca_2k_test",
             repo_type="dataset",
-            local_dir=tmp_ds_path,
         )
         shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)
 
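Note: dropping local_dir= matters for the cache-preload strategy above: without it, snapshot_download resolves into the shared ~/.cache/huggingface/hub cache and the test copies the snapshot out, instead of bypassing the cache entirely. The resulting pattern, as a sketch (the destination path here is illustrative only):

import shutil

from huggingface_hub import snapshot_download

# Download into (or reuse from) the shared HF hub cache...
snapshot_path = snapshot_download(
    repo_id="mhenrichsen/alpaca_2k_test", repo_type="dataset"
)
# ...then copy the cached snapshot into the test's working directory.
shutil.copytree(snapshot_path, "/tmp/alpaca_2k_test_ds", dirs_exist_ok=True)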