Skip to content

Commit 4dcf1b4

Browse files
authored
Merge pull request #657 from macrocosm-os/staging
v2.17.8
2 parents bec13a6 + 9342f4e commit 4dcf1b4

40 files changed

+3560
-770
lines changed

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -184,3 +184,4 @@ wandb
184184
**/api_keys.json
185185
weights.csv
186186
past_websites.csv
187+
timer_logs*

neurons/miners/epistula_miner/miner.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,7 @@ def run(self):
220220
async def run_inference(self, request: Request) -> str:
221221
data = await request.json()
222222
try:
223-
response = self.llm.generate(
223+
response = await self.llm.generate(
224224
data.get("messages"), sampling_params=data.get("sampling_parameters"), seed=data.get("seed")
225225
)
226226
return response

neurons/validator.py

+103 -34
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
import asyncio
2-
import multiprocessing as mp
32
import sys
43

54
import loguru
65
import netaddr
76
import requests
87
import torch
8+
9+
# import multiprocessing as mp
10+
import torch.multiprocessing as mp
911
import wandb
1012
from bittensor.core.extrinsics.serving import serve_extrinsic
1113

@@ -29,40 +31,29 @@
2931
NEURON_SAMPLE_SIZE = 100 # TODO: Should add this to constants.py
3032

3133

32-
def create_loop_process(task_queue, scoring_queue, reward_events):
34+
def create_loop_process(task_queue, scoring_queue, reward_events, miners_dict):
3335
settings.shared_settings = settings.SharedSettings.load(mode="validator")
3436
if settings.shared_settings.WANDB_ON:
3537
init_wandb(neuron="validator")
3638

37-
async def spawn_loops(task_queue, scoring_queue, reward_events):
39+
async def spawn_loops(task_queue, scoring_queue, reward_events, miners_dict):
3840
# ruff: noqa: E402
3941
from prompting.llms.model_manager import model_scheduler
40-
from prompting.miner_availability.miner_availability import availability_checking_loop
42+
43+
# from prompting.miner_availability.miner_availability import availability_checking_loop
4144
from prompting.tasks.task_creation import task_loop
42-
from prompting.tasks.task_sending import task_sender
43-
from prompting.weight_setting.weight_setter import weight_setter
4445
from shared.profiling import profiler
4546

4647
logger.info("Starting Profiler...")
4748
asyncio.create_task(profiler.print_stats(), name="Profiler"),
4849

49-
# -------- Duplicate of create_task_loop ----------
50-
logger.info("Starting AvailabilityCheckingLoop...")
51-
asyncio.create_task(availability_checking_loop.start())
52-
53-
logger.info("Starting TaskSender...")
54-
asyncio.create_task(task_sender.start(task_queue, scoring_queue))
55-
5650
logger.info("Starting TaskLoop...")
57-
asyncio.create_task(task_loop.start(task_queue, scoring_queue))
58-
# -------------------------------------------------
51+
asyncio.create_task(task_loop.start(task_queue, scoring_queue, miners_dict, simultaneous_loops=4))
5952

6053
logger.info("Starting ModelScheduler...")
6154
asyncio.create_task(model_scheduler.start(scoring_queue), name="ModelScheduler"),
6255
logger.info("Starting TaskScorer...")
63-
asyncio.create_task(task_scorer.start(scoring_queue, reward_events), name="TaskScorer"),
64-
logger.info("Starting WeightSetter...")
65-
asyncio.create_task(weight_setter.start(reward_events))
56+
asyncio.create_task(task_scorer.start(scoring_queue, reward_events, simultaneous_loops=4), name="TaskScorer"),
6657

6758
while True:
6859
await asyncio.sleep(5)
@@ -73,9 +64,9 @@ async def spawn_loops(task_queue, scoring_queue, reward_events):
7364
logger.debug(f"Number of tasks in Reward Events: {len(reward_events)}")
7465

7566
try:
76-
asyncio.run(spawn_loops(task_queue, scoring_queue, reward_events))
67+
asyncio.run(spawn_loops(task_queue, scoring_queue, reward_events, miners_dict))
7768
except Exception as e:
78-
logger.info(f"Terminating loop process: {e}")
69+
logger.exception(f"Terminating loop process: {e}")
7970
finally:
8071
logger.info("Cleaning up resources...")
8172

@@ -85,16 +76,10 @@ async def spawn_loops(task_queue, scoring_queue, reward_events):
8576
logger.info("WandB run finished.")
8677

8778

88-
def start_api(scoring_queue, reward_events):
79+
def start_api(scoring_queue, reward_events, miners_dict):
8980
async def start():
9081
from prompting.api.api import start_scoring_api # noqa: F401
9182

92-
# TODO: We should not use 2 availability loops for each process, in reality
93-
# we should only be sharing the miner availability data between processes.
94-
from prompting.miner_availability.miner_availability import availability_checking_loop
95-
96-
asyncio.create_task(availability_checking_loop.start())
97-
9883
try:
9984
external_ip = requests.get("https://checkip.amazonaws.com").text.strip()
10085
netaddr.IPAddress(external_ip)
@@ -111,37 +96,121 @@ async def start():
11196
logger.debug(f"Serve success: {serve_success}")
11297
except Exception as e:
11398
logger.warning(f"Failed to serve scoring api to chain: {e}")
114-
await start_scoring_api(task_scorer, scoring_queue, reward_events)
99+
await start_scoring_api(task_scorer, scoring_queue, reward_events, miners_dict)
115100

116101
while True:
117102
await asyncio.sleep(10)
118103

119104
asyncio.run(start())
120105

121106

107+
def start_task_sending_loop(task_queue, scoring_queue, miners_dict: dict):
108+
async def spawn_loops(task_queue, scoring_queue, miners_dict: dict):
109+
from prompting.tasks.task_sending import task_sender
110+
111+
logger.info("Starting task sending loop in validator2...")
112+
asyncio.create_task(task_sender.start(task_queue, scoring_queue, miners_dict, simultaneous_loops=10))
113+
while True:
114+
await asyncio.sleep(5)
115+
logger.debug("Task sending loop is running")
116+
117+
try:
118+
logger.info("Starting task sending loop in validator...")
119+
asyncio.run(spawn_loops(task_queue, scoring_queue, miners_dict))
120+
121+
except Exception as e:
122+
logger.exception(f"Task sending loop error: {e}")
123+
raise
124+
125+
126+
def start_availability_checking_loop(miners_dict: dict):
127+
async def spawn_loops(miners_dict: dict):
128+
from prompting.miner_availability.miner_availability import availability_checking_loop
129+
130+
logger.info("Starting availability checking loop in validator2...")
131+
asyncio.create_task(availability_checking_loop.start(miners_dict))
132+
while True:
133+
await asyncio.sleep(5)
134+
logger.debug("Availability checking loop is running")
135+
136+
try:
137+
logger.info("Starting availability checking loop in validator...")
138+
asyncio.run(spawn_loops(miners_dict))
139+
140+
except Exception as e:
141+
logger.exception(f"Availability checking loop error: {e}")
142+
raise
143+
144+
145+
def start_weight_setter_loop(reward_events):
146+
async def spawn_loops(reward_events):
147+
from prompting.weight_setting.weight_setter import weight_setter
148+
149+
logger.info("Starting weight setter loop in validator2...")
150+
asyncio.create_task(weight_setter.start(reward_events))
151+
while True:
152+
await asyncio.sleep(5)
153+
logger.debug("Weight setter loop is running")
154+
155+
try:
156+
logger.info("Starting weight setter loop in validator...")
157+
asyncio.run(spawn_loops(reward_events))
158+
159+
except Exception as e:
160+
logger.exception(f"Weight setter loop error: {e}")
161+
raise
162+
163+
122164
async def main():
123165
# will start checking the availability of miners at regular intervals, needed for API and Validator
124166
with torch.multiprocessing.Manager() as manager:
125167
reward_events = manager.list()
126168
scoring_queue = manager.list()
127169
task_queue = manager.list()
128-
129-
# Create process pool for managed processes
170+
miners_dict = manager.dict()
130171
processes = []
131172

132173
try:
133-
# # Start checking the availability of miners at regular intervals
174+
# Start checking the availability of miners at regular intervals
134175
if settings.shared_settings.DEPLOY_SCORING_API:
135176
# Use multiprocessing to bypass API blocking issue
136-
api_process = mp.Process(target=start_api, args=(scoring_queue, reward_events), name="API_Process")
177+
api_process = mp.Process(
178+
target=start_api, args=(scoring_queue, reward_events, miners_dict), name="API_Process"
179+
)
137180
api_process.start()
138181
processes.append(api_process)
139182

140-
loop_process = mp.Process(
141-
target=create_loop_process, args=(task_queue, scoring_queue, reward_events), name="LoopProcess"
183+
availability_process = mp.Process(
184+
target=start_availability_checking_loop,
185+
args=(miners_dict,),
186+
name="AvailabilityProcess",
142187
)
188+
availability_process.start()
189+
processes.append(availability_process)
143190

191+
loop_process = mp.Process(
192+
target=create_loop_process,
193+
args=(task_queue, scoring_queue, reward_events, miners_dict),
194+
name="LoopProcess",
195+
)
144196
loop_process.start()
197+
198+
task_sending_process = mp.Process(
199+
target=start_task_sending_loop,
200+
args=(task_queue, scoring_queue, miners_dict),
201+
name="TaskSendingProcess",
202+
)
203+
task_sending_process.start()
204+
processes.append(task_sending_process)
205+
206+
weight_setter_process = mp.Process(
207+
target=start_weight_setter_loop,
208+
args=(reward_events,),
209+
name="WeightSetterProcess",
210+
)
211+
weight_setter_process.start()
212+
processes.append(weight_setter_process)
213+
145214
processes.append(loop_process)
146215
GPUInfo.log_gpu_info()
147216

0 commit comments

Comments
 (0)