Skip to content

Commit c496821

Browse files
Commit message: use offline for precached stream dataset (#2453)
Parent: e46239f · Commit: c496821

File tree

8 files changed

+181
-126
lines changed

8 files changed

+181
-126
lines changed

.github/workflows/tests.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ jobs:
171171
run: |
172172
axolotl --help
173173
174+
- name: Show HF cache
175+
run: huggingface-cli scan-cache
176+
174177
- name: Run tests
175178
run: |
176179
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/

tests/conftest.py

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111

1212
import pytest
1313
import requests
14+
from datasets import load_dataset
1415
from huggingface_hub import snapshot_download
15-
from utils import disable_hf_offline
16+
from transformers import AutoTokenizer
17+
from utils import disable_hf_offline, enable_hf_offline
1618

1719

1820
def retry_on_request_exceptions(max_retries=3, delay=1):
@@ -46,7 +48,6 @@ def snapshot_download_w_retry(*args, **kwargs):
4648

4749

4850
@pytest.fixture(scope="session", autouse=True)
49-
@disable_hf_offline
5051
def download_smollm2_135m_model():
5152
# download the model
5253
snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")
@@ -59,28 +60,24 @@ def download_llama_68m_random_model():
5960

6061

6162
@pytest.fixture(scope="session", autouse=True)
62-
@disable_hf_offline
6363
def download_qwen_2_5_half_billion_model():
6464
# download the model
6565
snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")
6666

6767

6868
@pytest.fixture(scope="session", autouse=True)
69-
@disable_hf_offline
7069
def download_tatsu_lab_alpaca_dataset():
7170
# download the dataset
7271
snapshot_download_w_retry("tatsu-lab/alpaca", repo_type="dataset")
7372

7473

7574
@pytest.fixture(scope="session", autouse=True)
76-
@disable_hf_offline
7775
def download_mhenrichsen_alpaca_2k_dataset():
7876
# download the dataset
7977
snapshot_download_w_retry("mhenrichsen/alpaca_2k_test", repo_type="dataset")
8078

8179

8280
@pytest.fixture(scope="session", autouse=True)
83-
@disable_hf_offline
8481
def download_mhenrichsen_alpaca_2k_w_revision_dataset():
8582
# download the dataset
8683
snapshot_download_w_retry(
@@ -89,7 +86,6 @@ def download_mhenrichsen_alpaca_2k_w_revision_dataset():
8986

9087

9188
@pytest.fixture(scope="session", autouse=True)
92-
@disable_hf_offline
9389
def download_mlabonne_finetome_100k_dataset():
9490
# download the dataset
9591
snapshot_download_w_retry("mlabonne/FineTome-100k", repo_type="dataset")
@@ -124,6 +120,24 @@ def download_fozzie_alpaca_dpo_dataset():
124120
)
125121

126122

123+
@pytest.fixture(scope="session")
124+
@disable_hf_offline
125+
def dataset_fozzie_alpaca_dpo_dataset(
126+
download_fozzie_alpaca_dpo_dataset,
127+
): # pylint: disable=unused-argument,redefined-outer-name
128+
return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train")
129+
130+
131+
@pytest.fixture(scope="session")
132+
@disable_hf_offline
133+
def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff(
134+
download_fozzie_alpaca_dpo_dataset,
135+
): # pylint: disable=unused-argument,redefined-outer-name
136+
return load_dataset(
137+
"fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff"
138+
)
139+
140+
127141
@pytest.fixture(scope="session", autouse=True)
128142
def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
129143
# download the dataset
@@ -152,7 +166,6 @@ def download_deepseek_model_fixture():
152166

153167

154168
@pytest.fixture(scope="session", autouse=True)
155-
@disable_hf_offline
156169
def download_huggyllama_model_fixture():
157170
# download the tokenizer only
158171
snapshot_download_w_retry(
@@ -163,7 +176,6 @@ def download_huggyllama_model_fixture():
163176

164177

165178
@pytest.fixture(scope="session", autouse=True)
166-
@disable_hf_offline
167179
def download_llama_1b_model_fixture():
168180
# download the tokenizer only
169181
snapshot_download_w_retry(
@@ -174,7 +186,6 @@ def download_llama_1b_model_fixture():
174186

175187

176188
@pytest.fixture(scope="session", autouse=True)
177-
@disable_hf_offline
178189
def download_llama3_8b_model_fixture():
179190
# download the tokenizer only
180191
snapshot_download_w_retry(
@@ -183,7 +194,6 @@ def download_llama3_8b_model_fixture():
183194

184195

185196
@pytest.fixture(scope="session", autouse=True)
186-
@disable_hf_offline
187197
def download_llama3_8b_instruct_model_fixture():
188198
# download the tokenizer only
189199
snapshot_download_w_retry(
@@ -194,7 +204,6 @@ def download_llama3_8b_instruct_model_fixture():
194204

195205

196206
@pytest.fixture(scope="session", autouse=True)
197-
@disable_hf_offline
198207
def download_phi_35_mini_model_fixture():
199208
# download the tokenizer only
200209
snapshot_download_w_retry(
@@ -263,6 +272,17 @@ def download_llama2_model_fixture():
263272
)
264273

265274

275+
@pytest.fixture(scope="session", autouse=True)
276+
@enable_hf_offline
277+
def tokenizer_huggyllama(
278+
download_huggyllama_model_fixture,
279+
): # pylint: disable=unused-argument,redefined-outer-name
280+
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
281+
tokenizer.pad_token = "</s>"
282+
283+
return tokenizer
284+
285+
266286
@pytest.fixture
267287
def temp_dir():
268288
# Create a temporary directory

tests/prompt_strategies/conftest.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ def fixture_toolcalling_dataset():
109109

110110
@pytest.fixture(name="llama3_tokenizer", scope="session", autouse=True)
111111
@enable_hf_offline
112-
def fixture_llama3_tokenizer():
112+
def fixture_llama3_tokenizer(
113+
download_llama3_8b_instruct_model_fixture,
114+
): # pylint: disable=unused-argument,redefined-outer-name
113115
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
114116

115117
return tokenizer
@@ -123,7 +125,10 @@ def fixture_smollm2_tokenizer():
123125

124126

125127
@pytest.fixture(name="mistralv03_tokenizer", scope="session", autouse=True)
126-
def fixture_mistralv03_tokenizer():
128+
@enable_hf_offline
129+
def fixture_mistralv03_tokenizer(
130+
download_mlx_mistral_7b_model_fixture,
131+
): # pylint: disable=unused-argument,redefined-outer-name
127132
tokenizer = AutoTokenizer.from_pretrained(
128133
"mlx-community/Mistral-7B-Instruct-v0.3-4bit"
129134
)

tests/prompt_strategies/test_chat_templates_advanced.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from datasets import Dataset
1010
from tokenizers import AddedToken
1111
from transformers import PreTrainedTokenizer
12+
from utils import enable_hf_offline
1213

1314
from axolotl.prompt_strategies.chat_template import (
1415
ChatTemplatePrompter,
@@ -101,6 +102,7 @@ def _should_skip_turn(self, tokenizer, turn, turn_idx, start_idx, end_idx):
101102
return True
102103
return False
103104

105+
@enable_hf_offline
104106
def test_train_on_inputs_true(
105107
self,
106108
tokenizer,

0 commit comments

Comments (0)