Skip to content

Commit 7651550

Browse files
authored
make sure to download fixtures for kd test (axolotl-ai-cloud#2541)
* make sure to download fixtures for kd test
* use same alpaca dataset
1 parent 341e95a commit 7651550

File tree

6 files changed

+36
-4
lines changed

6 files changed

+36
-4
lines changed

tests/conftest.py

+26
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,14 @@ def download_tiny_shakespeare_dataset():
193193
snapshot_download_w_retry("winglian/tiny-shakespeare", repo_type="dataset")
194194

195195

196+
@pytest.fixture(scope="session", autouse=True)
197+
def download_evolkit_kd_sample_dataset():
198+
# download the dataset
199+
snapshot_download_w_retry(
200+
"axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample", repo_type="dataset"
201+
)
202+
203+
196204
@pytest.fixture(scope="session", autouse=True)
197205
def download_deepseek_model_fixture():
198206
snapshot_download_w_retry("axolotl-ai-co/DeepSeek-V3-11M", repo_type="model")
@@ -208,6 +216,16 @@ def download_huggyllama_model_fixture():
208216
)
209217

210218

219+
@pytest.fixture(scope="session", autouse=True)
220+
def download_llama33_70b_model_fixture():
221+
# download the tokenizer only
222+
snapshot_download_w_retry(
223+
"axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer",
224+
repo_type="model",
225+
allow_patterns=["*token*", "config.json"],
226+
)
227+
228+
211229
@pytest.fixture(scope="session", autouse=True)
212230
def download_llama_1b_model_fixture():
213231
# download the tokenizer only
@@ -315,6 +333,14 @@ def download_llama2_model_fixture():
315333
)
316334

317335

336+
@pytest.fixture(scope="session", autouse=True)
337+
def download_llama32_1b_model_fixture():
338+
snapshot_download_w_retry(
339+
"osllmai-community/Llama-3.2-1B",
340+
repo_type="model",
341+
)
342+
343+
318344
@pytest.fixture
319345
@enable_hf_offline
320346
def tokenizer_huggyllama(

tests/e2e/multigpu/solo/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Tests under this directory should get run "solo" on their own as they
2+
# seem to cause issues when run in the same batch as other tests.

tests/e2e/multigpu/solo/test_flex.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ def test_loss_llama(self, temp_dir):
4949
},
5050
"datasets": [
5151
{
52-
"path": "vicgalle/alpaca-gpt4",
52+
"path": "tatsu-lab/alpaca",
5353
"type": "alpaca",
54+
"split": "train[:10%]",
5455
},
5556
],
5657
"num_epochs": 1,

tests/e2e/patched/test_resume.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,9 @@ def test_resume_lora_packed(self, temp_dir):
4646
},
4747
"datasets": [
4848
{
49-
"path": "vicgalle/alpaca-gpt4",
49+
"path": "tatsu-lab/alpaca",
5050
"type": "alpaca",
51+
"split": "train[:10%]",
5152
},
5253
],
5354
"num_epochs": 2,

tests/e2e/solo/test_flex.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ def test_loss_llama(self, temp_dir):
4141
},
4242
"datasets": [
4343
{
44-
"path": "vicgalle/alpaca-gpt4",
44+
"path": "tatsu-lab/alpaca",
4545
"type": "alpaca",
46+
"split": "train[:10%]",
4647
},
4748
],
4849
"num_epochs": 1,

tests/e2e/test_packing_loss.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,9 @@ def test_loss_packed(self, temp_dir):
4040
},
4141
"datasets": [
4242
{
43-
"path": "vicgalle/alpaca-gpt4",
43+
"path": "tatsu-lab/alpaca",
4444
"type": "alpaca",
45+
"split": "train[:10%]",
4546
},
4647
],
4748
"num_epochs": 1,

0 commit comments

Comments (0)