@@ -16,36 +16,34 @@
 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
+import argparse
 import asyncio
+import datetime as dt
 import math
 import os
 import random
 import typing
 
-
-import wandb
+import bittensor as bt
 import torch
-
-import argparse
-import constants
-
+import wandb
+from dotenv import load_dotenv
 from taoverse.metagraph import utils as metagraph_utils
 from taoverse.model.storage.chain.chain_model_metadata_store import (
     ChainModelMetadataStore,
 )
 from taoverse.model.storage.hugging_face.hugging_face_model_store import (
     HuggingFaceModelStore,
 )
 from taoverse.model.storage.model_metadata_store import ModelMetadataStore
+from taoverse.utilities import logging
+from taoverse.utilities import utils as taoverse_utils
 from taoverse.utilities.enum_action import IntEnumAction
-from competitions.data import CompetitionId
-
-import pretrain as pt
-import bittensor as bt
 from transformers import PreTrainedModel
-import datetime as dt
 
-from dotenv import load_dotenv
+import constants
+import pretrain as pt
+from competitions.data import CompetitionId
 
 load_dotenv()  # take environment variables from .env.
 
@@ -132,9 +130,7 @@ def get_config():
     parser.add_argument(
         "--bs", type=int, default=constants.batch_size, help="Batch size"
     )
-    parser.add_argument(
-        "--sl", type=int, default=constants.SEQUENCE_LENGTH_2, help="Sequence length"
-    )
+    parser.add_argument("--sl", type=int, default=4096, help="Sequence length")
     parser.add_argument(
         "--accumulation_steps",
         type=int,
@@ -149,7 +145,7 @@ def get_config():
     )
     parser.add_argument(
         "--netuid",
-        type=str,
+        type=int,
         default=constants.SUBNET_UID,
         help="The subnet UID.",
     )
@@ -196,7 +192,7 @@ async def load_starting_model(
             metagraph=metagraph,
             metadata_store=metadata_store,
         )
-        bt.logging.success(
+        logging.info(
            f"Training with best model from competition: {config.competition_id}. Model={str(model)}"
         )
         return model
@@ -210,33 +206,37 @@ async def load_starting_model(
             metagraph=metagraph,
             metadata_store=metadata_store,
         )
-        bt.logging.success(
+        logging.info(
            f"Training with model from uid: {config.load_uid}. Model={str(model)}"
         )
         return model
 
     # Check if we should load a model from a local directory.
     if config.load_model_dir:
         model = pt.mining.load_local_model(config.load_model_dir, kwargs)
-        bt.logging.success(f"Training with model from disk. Model={str(model)}")
+        logging.info(f"Training with model from disk. Model={str(model)}")
         return model
 
     # Check if we should load a model from a local file.
     if config.load_model:
         model = pt.mining.load_gpt2_model(config.load_model)
-        bt.logging.success(f"Training with model from disk. Model={str(model)}")
+        logging.info(f"Training with model from disk. Model={str(model)}")
         return model
 
     # Start from scratch.
     model = pt.model.get_model()
-    bt.logging.success(f"Training from scratch. Model={str(model)}")
+    logging.info(f"Training from scratch. Model={str(model)}")
 
     return model
 
 
 async def main(config: bt.config):
+    raise NotImplementedError("You must implement your own training logic in miner.py")
+
     # Create bittensor objects.
-    bt.logging(config=config)
+    bt.logging.set_warning()
+    taoverse_utils.logging.reinitialize()
+    taoverse_utils.configure_logging(config)
 
     wallet = bt.wallet(config=config)
     subtensor = bt.subtensor(config=config)
@@ -250,7 +250,7 @@ async def main(config: bt.config):
     # If running online, make sure the miner is registered, has a hugging face access token, and has provided a repo id.
     my_uid = None
     if not config.offline:
-        my_uid = meta_utils.assert_registered(wallet, metagraph)
+        my_uid = metagraph_utils.assert_registered(wallet, metagraph)
         HuggingFaceModelStore.assert_access_token_exists()
 
     # Create a unique run id for this run.
@@ -261,7 +261,7 @@ async def main(config: bt.config):
     use_wandb = False
     if not config.offline:
         if config.wandb_project is None or config.wandb_entity is None:
-            bt.logging.warning(
+            logging.warning(
                 "Wandb project or entity not specified. This run will not be logged to wandb"
             )
         else:
@@ -273,17 +273,17 @@ async def main(config: bt.config):
 
     if not model_constraints:
         raise RuntimeError(f"No competition found for {config.competition_id}")
-
+
     kwargs = model_constraints.kwargs.copy()
-
+
     # Init model.
     # Init model.
     tokenizer = pt.model.load_tokenizer(model_constraints, cache_dir=config.model_dir)
     model = await load_starting_model(config, metagraph, chain_metadata_store, kwargs)
     model = model.train()
     model = model.to(config.device)
 
-    bt.logging.success(f"Saving model to path: {model_dir}.")
+    logging.info(f"Saving model to path: {model_dir}.")
     pt.mining.save(model, model_dir)
 
     # Build optimizer
@@ -308,7 +308,7 @@ async def main(config: bt.config):
                "uid": my_uid,
                "hotkey": wallet.hotkey.ss58_address,
                "run_name": run_id,
-                "version": constants.__version__,
+                "version": constants.__version__,
                "type": "miner",
            },
            allow_val_change=True,
@@ -318,7 +318,7 @@ async def main(config: bt.config):
        # This is not seen by validators.
        wandb_run.save(os.path.join(model_dir, "*"), base_path=model_dir, policy="end")
    else:
-        bt.logging.warning(
+        logging.warning(
            "Not posting run to wandb. Either --offline is specified or the wandb settings are missing."
        )
 
@@ -335,7 +335,7 @@ async def main(config: bt.config):
            epoch_loss = 0.0
 
            # Prepare the data loader with random pages for each epoch
-            bt.logging.success(
+            logging.info(
                f"Loading {config.pages_per_epoch} pages for training this epoch"
            )
            random_pages = [
@@ -346,7 +346,7 @@ async def main(config: bt.config):
            # Change this loader if you wish to use a different dataset
            loader = pt.dataset.SubsetFineWebEdu2Loader(
                batch_size=config.bs,
-                sequence_length=config.sl
+                sequence_length=config.sl,
                num_pages=config.pages_per_epoch,
                tokenizer=tokenizer,
            )
@@ -369,7 +369,7 @@ async def main(config: bt.config):
                    n_acc_steps += 1
                    optimizer.step()  # Perform a single optimization step
                    optimizer.zero_grad()  # Clear gradients
-                    bt.logging.success(
+                    logging.info(
                        f"Step: {n_acc_steps} loss: {outputs.loss.detach().item()}"
                    )
                    if use_wandb:
@@ -388,47 +388,46 @@ async def main(config: bt.config):
            avg_loss = epoch_loss / n_batches
 
            # Log the average loss for the epoch
-            bt.logging.success(f"Epoch: {epoch_step} average loss: {avg_loss}")
+            logging.info(f"Epoch: {epoch_step} average loss: {avg_loss}")
            epoch_step += 1
 
            # Check if the average loss of this epoch is the best we've seen so far
            if avg_loss < best_avg_loss:
                best_avg_loss = avg_loss  # Update the best average loss
 
-                bt.logging.success(f"New best average loss: {best_avg_loss}.")
+                logging.info(f"New best average loss: {best_avg_loss}.")
 
                # Save the model to your mining dir.
-                bt.logging.success(f"Saving model to path: {model_dir}.")
+                logging.info(f"Saving model to path: {model_dir}.")
                pt.mining.save(model, model_dir)
 
-        bt.logging.success("Finished training")
+        logging.info("Finished training")
        # Push the model to your run.
        if not config.offline:
            if best_avg_loss < config.avg_loss_upload_threshold:
-                bt.logging.success(
+                logging.info(
                    f"Trained model had a best_avg_loss of {best_avg_loss} which is below the threshold of {config.avg_loss_upload_threshold}. Uploading to hugging face. "
                )
 
                # First, reload the best model from the training run.
                model_to_upload = pt.mining.load_local_model(
                    model_dir, model_constraints.kwargs
                )
-
+
                await pt.mining.push(
                    model_to_upload,
                    config.hf_repo_id,
-                    wallet,
+                    wallet,
                    config.competition_id,
                    metadata_store=chain_metadata_store,
-                    use_hotkey_in_hash=config.use_hotkey_in_hash,
                )
-
+
            else:
-                bt.logging.success(
+                logging.info(
                    f"This training run achieved a best_avg_loss={best_avg_loss}, which did not meet the upload threshold. Not uploading to hugging face."
                )
        else:
-            bt.logging.success(
+            logging.info(
                "Not uploading to hugging face because --offline was specified."
            )
 
@@ -440,7 +439,7 @@ async def main(config: bt.config):
 
 if __name__ == "__main__":
     # Parse and print configuration
-    config = neuron_config.miner_config()
+    config = get_config()
 
     if config.list_competitions:
        print(constants.COMPETITION_SCHEDULE_BY_BLOCK)
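
The commit leaves main() raising NotImplementedError, so the training loop in the hunks above becomes reference code that each miner must replace with their own. Below is a minimal sketch, not part of this commit, of the gradient-accumulation step those hunks assume, written in plain PyTorch; model, loader, optimizer, and accumulation_steps stand in for the objects built earlier in main().

import torch


def train_epoch(model, loader, optimizer, accumulation_steps, device="cpu"):
    """One epoch of causal-LM training with gradient accumulation (sketch only)."""
    model.train()
    epoch_loss, n_batches, n_acc_steps = 0.0, 0, 0
    for i, batch in enumerate(loader):
        inputs = torch.as_tensor(batch).to(device)
        # HF-style causal LMs compute their own loss when labels are supplied.
        outputs = model(inputs, labels=inputs)
        # Scale the loss so accumulated gradients average rather than sum.
        (outputs.loss / accumulation_steps).backward()
        if (i + 1) % accumulation_steps == 0:
            n_acc_steps += 1
            optimizer.step()       # apply the accumulated gradients
            optimizer.zero_grad()  # clear them for the next accumulation window
        epoch_loss += outputs.loss.detach().item()
        n_batches += 1
    return epoch_loss / max(n_batches, 1)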