import pytest
import requests
+from datasets import load_dataset
from huggingface_hub import snapshot_download
-from utils import disable_hf_offline
+from transformers import AutoTokenizer
+from utils import disable_hf_offline, enable_hf_offline


def retry_on_request_exceptions(max_retries=3, delay=1):
@@ -46,7 +48,6 @@ def snapshot_download_w_retry(*args, **kwargs):
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_smollm2_135m_model():
    # download the model
    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")
@@ -59,28 +60,24 @@ def download_llama_68m_random_model():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_qwen_2_5_half_billion_model():
    # download the model
    snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_tatsu_lab_alpaca_dataset():
    # download the dataset
    snapshot_download_w_retry("tatsu-lab/alpaca", repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_mhenrichsen_alpaca_2k_dataset():
    # download the dataset
    snapshot_download_w_retry("mhenrichsen/alpaca_2k_test", repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_mhenrichsen_alpaca_2k_w_revision_dataset():
    # download the dataset
    snapshot_download_w_retry(
@@ -89,7 +86,6 @@ def download_mhenrichsen_alpaca_2k_w_revision_dataset():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_mlabonne_finetome_100k_dataset():
    # download the dataset
    snapshot_download_w_retry("mlabonne/FineTome-100k", repo_type="dataset")
@@ -124,6 +120,24 @@ def download_fozzie_alpaca_dpo_dataset():
    )


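+# Session-scoped dataset fixture: reuses the snapshot fetched above and hands tests
+# a ready-to-use `datasets.Dataset` instead of reloading it in every test.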
+@pytest.fixture(scope="session")
+@disable_hf_offline
+def dataset_fozzie_alpaca_dpo_dataset(
+    download_fozzie_alpaca_dpo_dataset,
+):  # pylint: disable=unused-argument,redefined-outer-name
+    return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train")
+
+
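+# Same dataset, but pinned to revision "ea82cff" of the Hub repo.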
+@pytest.fixture(scope="session")
+@disable_hf_offline
+def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff(
+    download_fozzie_alpaca_dpo_dataset,
+):  # pylint: disable=unused-argument,redefined-outer-name
+    return load_dataset(
+        "fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff"
+    )
+
+
@pytest.fixture(scope="session", autouse=True)
def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
    # download the dataset
@@ -152,7 +166,6 @@ def download_deepseek_model_fixture():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_huggyllama_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
@@ -163,7 +176,6 @@ def download_huggyllama_model_fixture():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_llama_1b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
@@ -174,7 +186,6 @@ def download_llama_1b_model_fixture():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_llama3_8b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
@@ -183,7 +194,6 @@ def download_llama3_8b_model_fixture():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_llama3_8b_instruct_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
@@ -194,7 +204,6 @@ def download_llama3_8b_instruct_model_fixture():
@pytest.fixture(scope="session", autouse=True)
-@disable_hf_offline
def download_phi_35_mini_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
@@ -263,6 +272,17 @@ def download_llama2_model_fixture():
    )


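+# Shared huggyllama/llama-7b tokenizer, loaded once per session from the snapshot
+# fetched by download_huggyllama_model_fixture; pad_token is set to "</s>" (the EOS token).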
+@pytest.fixture(scope="session", autouse=True)
+@enable_hf_offline
+def tokenizer_huggyllama(
+    download_huggyllama_model_fixture,
+):  # pylint: disable=unused-argument,redefined-outer-name
+    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+    tokenizer.pad_token = "</s>"
+
+    return tokenizer
+
+
@pytest.fixture
def temp_dir():
    # Create a temporary directory