
Commit d6e2bcf
Better documentation
1 parent 121bb48

File tree

2 files changed: +199 -20 lines


mergekit/graph.py (+128 -1)
@@ -91,23 +91,55 @@ def uses_accelerator(self) -> bool:
         """
         Returns True if the task can take advantage of matrix operation
         acceleration (such as on a GPU).
+
+        Tasks that perform heavy matrix operations should return True here
+        so they can be scheduled on appropriate devices.
+
+        Returns:
+            bool: True if the task benefits from acceleration, False otherwise
         """
         return False
 
     def main_thread_only(self) -> bool:
         """
         Returns True if the task should only be executed on the main thread.
+
+        Tasks with side effects like file I/O or that require specific thread
+        context should return True here to avoid parallel execution issues.
+
+        Returns:
+            bool: True if the task must run on the main thread, False otherwise
         """
         return False
 
     def duplicate_per_gpu(self) -> bool:
         """
         Returns True if the task should be duplicated for each GPU.
+
+        Tasks that are faster to execute than to transfer between devices
+        or are common dependencies of otherwise independent tasks should
+        return True here to maximize parallelism.
+
+        Returns:
+            bool: True if the task should be duplicated per GPU, False otherwise
         """
         return False
 
 
 class TaskUniverse:
+    """
+    Container for tasks and their relationships.
+
+    Maintains a registry of tasks and their dependencies, allowing efficient
+    lookup and traversal of the task graph.
+
+    Attributes:
+        tasks: List of all tasks in this universe
+        task_to_index: Mapping from task instances to their indices
+        task_arguments: Mapping from task indices to their argument dependencies
+        _type_id_to_index: Quick lookup for seen task instances
+    """
+
     tasks: List[Task]
     task_to_index: Dict[Task, int]
     task_arguments: Dict[int, Dict[str, int]]
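
For illustration, a hypothetical task that opts into the hooks documented above. This is a sketch, not code from this commit: it assumes the usual mergekit Task interface of arguments() and execute() plus pydantic-style field declarations; MatMulTask and its fields are invented names.

from typing import Any, Dict

from mergekit.graph import Task


class MatMulTask(Task):
    # Hypothetical upstream tasks whose results feed execute().
    lhs_task: Task
    rhs_task: Task

    def arguments(self) -> Dict[str, Task]:
        # Keys become keyword argument names passed to execute().
        return {"lhs": self.lhs_task, "rhs": self.rhs_task}

    def execute(self, lhs: Any, rhs: Any) -> Any:
        return lhs @ rhs

    def uses_accelerator(self) -> bool:
        # Dense matrix multiplication benefits from GPU scheduling.
        return True

    def main_thread_only(self) -> bool:
        # No file I/O or thread-affine state, so worker threads are fine.
        return False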
@@ -123,6 +155,18 @@ def __init__(self, tasks: Optional[Iterable[Task]] = None):
             self.add_task(task)
 
     def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":
+        """
+        Add a task to the universe and return a handle to it.
+
+        If the task already exists in the universe, returns a handle to the existing instance.
+
+        Args:
+            task: The task to add
+            recursive: If True, also add all dependent tasks recursively
+
+        Returns:
+            TaskHandle: A handle to the added task
+        """
         _ti_key = (type(task), id(task))
         if _ti_key in self._type_id_to_index:
             index = self._type_id_to_index[_ti_key]
@@ -144,30 +188,80 @@ def add_task(self, task: Task, recursive: bool = True) -> "TaskHandle":
         return TaskHandle(self, index)
 
     def get_handle(self, task: Task) -> Optional["TaskHandle"]:
+        """
+        Get a TaskHandle for an existing task, if it exists in this universe.
+
+        Args:
+            task: The task to look up
+
+        Returns:
+            Optional[TaskHandle]: A handle to the task, or None if not found
+        """
         if task not in self.task_to_index:
             return None
         return TaskHandle(self, self.task_to_index[task])
 
 
 class TaskHandle:
+    """
+    A reference to a task within a specific TaskUniverse.
+
+    TaskHandle provides a lightweight way to refer to tasks without directly
+    holding the task instances themselves. Particularly useful for putting
+    tasks in sets or as keys in dictionaries. Much faster to compare and hash
+    than full Task instances.
+
+    Attributes:
+        _universe: The TaskUniverse containing the referenced task
+        _index: The index of the task within the universe
+    """
+
     __slots__ = ["_universe", "_index"]
     _universe: TaskUniverse
     _index: int
 
     def __init__(self, universe: TaskUniverse, index: int):
+        """
+        Initialize a TaskHandle.
+
+        Args:
+            universe: The TaskUniverse containing the task
+            index: The index of the task within the universe
+        """
         self._universe = universe
         self._index = index
 
     def task(self) -> Task:
+        """
+        Get the actual Task instance referenced by this handle.
+
+        Returns:
+            Task: The referenced task
+        """
         return self._universe.tasks[self._index]
 
     def arguments(self) -> Dict[str, "TaskHandle"]:
+        """
+        Get handles to all argument tasks (dependencies) of this task.
+
+        Returns:
+            Dict[str, TaskHandle]: Mapping from argument names to task handles
+        """
         return {
             k: TaskHandle(self._universe, v)
             for k, v in self._universe.task_arguments[self._index].items()
         }
 
     def __eq__(self, other):
+        """
+        Check if two TaskHandles refer to the same task in the same universe.
+
+        Args:
+            other: Another object to compare with
+
+        Returns:
+            bool: True if equal, False otherwise
+        """
         if not isinstance(other, TaskHandle):
             return False
         if self._index != other._index:
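
A short usage sketch of the handle machinery above (my_task is a stand-in for any Task instance; only the API shown in this diff is assumed):

universe = TaskUniverse()
handle = universe.add_task(my_task)  # dependencies are registered recursively
assert universe.get_handle(my_task) == handle

# Handles hash by index, so they are cheap set members and dict keys:
results = {handle: None}
for name, dep_handle in handle.arguments().items():
    results.setdefault(dep_handle, None)  # reserve a slot per dependency result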
@@ -180,21 +274,53 @@ def __hash__(self):
         return self._index
 
     def __str__(self):
-        return f"TaskHandle({self._index})"
+        return f"TaskHandle({type(self.task()).__name__}, {self._index})"
+
+    __repr__ = __str__
 
 
 class ExecutionSchedule:
+    """
+    Represents an ordered schedule of tasks for execution and their lifecycle information.
+
+    Tracks when each task's result can be discarded to optimize memory usage.
+
+    Attributes:
+        tasks: Ordered list of tasks to execute
+        last_use_index: Maps each task to the index in the schedule where its result is last used
+    """
+
     tasks: List[TaskHandle]
     last_use_index: Dict[TaskHandle, int]
 
     def __init__(self, tasks: List[TaskHandle], last_use_index: Dict[TaskHandle, int]):
+        """
+        Initialize an execution schedule.
+
+        Args:
+            tasks: Ordered list of tasks to execute
+            last_use_index: Dictionary mapping tasks to their last use index in the schedule
+        """
         self.tasks = tasks
         self.last_use_index = last_use_index
 
 
 def build_schedule(
     targets: List[TaskHandle], cached_values: Dict[TaskHandle, Any]
 ) -> ExecutionSchedule:
+    """
+    Build an execution schedule for the given target tasks.
+
+    Creates a topologically sorted schedule that respects task dependencies and
+    tracks when each task's result can be discarded to optimize memory usage.
+
+    Args:
+        targets: List of target tasks that need to be executed
+        cached_values: Dictionary of task results that are already available
+
+    Returns:
+        ExecutionSchedule: A schedule containing tasks to execute and their lifecycle info
+    """
     if not targets:
         return ExecutionSchedule(tasks=[], last_use_index={})

@@ -241,6 +367,7 @@ def _compare_key(node: TaskHandle) -> Tuple[str, int]:
         if (node != dummy_handle) and node not in (cached_values or {})
     ]
 
+    # Calculate last use indices for memory optimization
     last_use_index = {}
     for idx, task in reversed(list(enumerate(schedule))):
         for dep in task.arguments().values():
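
The reverse sweep above records, for each task, the latest schedule position that consumes its result, letting the executor free that result once execution passes the recorded index. A self-contained sketch of the same pattern on toy data (not mergekit code):

# Toy schedule: position -> dependency positions consumed at that step.
schedule = {0: [], 1: [0], 2: [0, 1], 3: [2]}

last_use = {}
for idx in reversed(sorted(schedule)):
    for dep in schedule[idx]:
        if dep not in last_use:  # first hit in reverse order is the last use
            last_use[dep] = idx
    last_use.setdefault(idx, idx)  # a task's own slot counts as a use

print(last_use)  # {2: 3, 3: 3, 0: 2, 1: 2} -> task 0 can be freed after step 2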

mergekit/multigpu_executor.py (+71 -19)
@@ -34,7 +34,17 @@
 
 class MultiGPUExecutor:
     """
-    Execute tasks across multiple GPUs.
+    Execute computational tasks in parallel across multiple GPUs.
+
+    This class analyzes the dependency structure of a task graph and distributes
+    the workload across available GPUs while respecting:
+    1. Tasks requiring main thread execution
+    2. Tasks that need to be duplicated on each GPU
+    3. Task dependencies and data locality
+    4. Memory management for intermediate results
+
+    It automatically partitions the task graph into leading tasks (main thread, pre-GPU),
+    parallel tasks (distributed across GPUs), and trailing tasks (main thread, post-GPU).
 
     Attributes:
         num_gpus: Number of GPUs to utilize (None = all available)
@@ -49,12 +59,18 @@ def __init__(
         storage_device: Optional[torch.device] = None,
     ):
         """
-        Initialize the executor with a list of tasks.
+        Initialize the executor with a list of target tasks.
+
+        This performs initial task graph analysis, including:
+        - Finding tasks that must run on the main thread before parallel execution
+        - Finding tasks that must run on the main thread after parallel execution
+        - Partitioning parallel tasks into islands that can run independently
+        - Assigning islands to GPUs using a load-balancing approach
 
         Args:
-            tasks: List of tasks to execute
+            targets: List of final target tasks to execute
             num_gpus: Number of GPUs to utilize (None = all available)
-            storage_device: Device for storing tensors between stages
+            storage_device: Device for storing intermediate results between execution stages
         """
         self.results: Dict[TaskHandle, Any] = {}
         self.storage_device = storage_device
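
A sketch of driving the executor (target_tasks is hypothetical; the constructor arguments follow the docstring above, while the run() method yielding (task, result) pairs is an assumption by analogy with mergekit's single-GPU Executor, not something this diff shows):

import torch

executor = MultiGPUExecutor(
    targets=target_tasks,                # hypothetical list of final Task objects
    num_gpus=2,                          # None would use all visible GPUs
    storage_device=torch.device("cpu"),  # park intermediate results in host memory
)
for task, result in executor.run():      # assumed API, see note above
    ...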
@@ -191,9 +207,19 @@ def _find_trailing_tasks(self, tasks: List[TaskHandle]) -> Set[TaskHandle]:
         """
         Identify tasks that must execute AFTER parallel GPU tasks complete.
 
-        Trailing tasks must:
-        - Require main thread execution
-        - Not have non-trailing dependants
+        This method finds tasks that need to run after parallel execution because they
+        require the main thread and consume results produced by the parallel tasks.
+
+        A task is considered "trailing" if:
+        - It requires main thread execution (task.main_thread_only() is True)
+        - All tasks dependent on it are also trailing tasks (recursive condition)
+        - OR it has no dependents (terminal task)
+
+        Args:
+            tasks: List of task handles to analyze
+
+        Returns:
+            Set[TaskHandle]: Set of tasks that should be executed after parallel processing
         """
         dependants = defaultdict(set)
         for task_idx, arg_indices in self.universe.task_arguments.items():
@@ -215,11 +241,21 @@ def _find_trailing_tasks(self, tasks: List[TaskHandle]) -> Set[TaskHandle]:
         return trailing_tasks
 
     def _find_leading_tasks(self, tasks: List[TaskHandle]) -> Set[TaskHandle]:
-        """Identify tasks that must execute BEFORE parallel GPU tasks.
+        """
+        Identify tasks that must execute BEFORE parallel GPU tasks.
+
+        This method finds tasks that need to run before parallel execution because they
+        require the main thread and are dependencies of other tasks.
 
-        Leading tasks must:
-        - Require main thread execution
-        - Not have non-leading dependencies
+        A task is considered "leading" if:
+        - It requires main thread execution (task.main_thread_only() is True)
+        - It has no dependencies, or all its dependencies are also leading tasks
+
+        Args:
+            tasks: List of task handles to analyze
+
+        Returns:
+            Set[TaskHandle]: Set of tasks that should be executed before parallel processing
         """
         leading_tasks = set()
         for task_handle in tasks:
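
Leading and trailing sets are mirror-image fixpoints over the dependency graph. A standalone sketch of the "leading" rule on a toy graph (not mergekit code):

# Toy graph: task -> its dependencies; plus a main-thread-only flag per task.
deps = {"load": [], "tokenize": ["load"], "merge": ["tokenize"], "save": ["merge"]}
main_only = {"load": True, "tokenize": True, "merge": False, "save": True}

leading = set()
changed = True
while changed:  # iterate until no task changes classification
    changed = False
    for task, ds in deps.items():
        if task in leading or not main_only[task]:
            continue
        if all(d in leading for d in ds):  # no deps, or all deps already leading
            leading.add(task)
            changed = True

print(sorted(leading))  # ['load', 'tokenize']; 'save' is blocked by 'merge'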
@@ -236,11 +272,22 @@ def _assign_islands_to_gpus(
         self, tasks: List[TaskHandle], num_gpus: int
     ) -> Dict[torch.device, List[TaskHandle]]:
         """
-        Assign task islands to GPUs.
+        Assign task islands to GPUs for parallel execution.
+
+        This method partitions the parallel task graph into disjoint subgraphs
+        (islands) that can be executed independently on different GPUs. It uses
+        a load-balancing approach to distribute islands across available GPUs.
 
-        Task islands (weakly connected components) are groups of tasks that
-        can execute independently. This method identifies islands in the
-        non-trailing, non-leading task graph and assigns them to devices.
+        Task islands are identified as weakly connected components of the task
+        dependency graph: groups of tasks that are connected through
+        dependencies but have no dependencies outside their group.
+
+        Args:
+            tasks: List of parallel tasks to assign to GPUs
+            num_gpus: Number of available GPUs
+
+        Returns:
+            Dict[torch.device, List[TaskHandle]]: Mapping from GPU devices to assigned tasks
         """
         task_set = set(tasks)
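
Weak connectivity treats every dependency edge as undirected, so an island is simply a flood fill over that view. A standalone sketch (toy indices, not the commit's implementation):

from collections import defaultdict

# Toy parallel subgraph: task index -> indices of its dependencies.
deps = {0: [], 1: [0], 2: [], 3: [2], 4: []}

adj = defaultdict(set)  # undirected adjacency: direction is ignored
for t, ds in deps.items():
    adj[t]  # make sure isolated nodes appear as their own island
    for d in ds:
        adj[t].add(d)
        adj[d].add(t)

islands, seen = [], set()
for start in list(adj):
    if start in seen:
        continue
    stack, island = [start], set()
    while stack:  # depth-first flood fill
        node = stack.pop()
        if node in seen:
            continue
        seen.add(node)
        island.add(node)
        stack.extend(adj[node] - seen)
    islands.append(island)

print(islands)  # [{0, 1}, {2, 3}, {4}]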

@@ -262,7 +309,7 @@
             continue
         # don't need to sort, inner executor will handle
         island_tasks = [TaskHandle(self.universe, idx) for idx in island]
-        # assign to GPU with fewest tasks
+        # assign to GPU with fewest tasks (load balancing)
         device_idx = min(
             range(num_gpus),
             key=lambda i: len(assignments.get(torch.device(f"cuda:{i}"), [])),
@@ -281,11 +328,15 @@ def _device_worker(
         """
         Execute a set of tasks on a single GPU.
 
+        This method runs as a thread worker for a specific GPU. It creates an execution
+        stream on the assigned GPU, runs the tasks, and queues results back to the main
+        thread. Only results needed for target tasks or trailing tasks are retained.
+
         Args:
-            island_tasks: List of tasks to execute
+            task_list: List of tasks to execute on this GPU
             cached_values: Values of previously-executed dependent tasks
-            device: Device to execute tasks on
-            quiet: Suppress progress bar output
+            device: GPU device to execute tasks on
+            quiet: Whether to suppress progress bar output
         """
         LOG.debug(f"Device {device} starting")
         with torch.device(device):
@@ -300,6 +351,7 @@
             count = 0
             for task_handle, result in exec._run(quiet=quiet):
                 count += 1
+                # Only keep results needed for target tasks or trailing tasks
                 if not (
                     task_handle in self.targets
                     or task_handle in self.trailing_dependencies
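
The worker-to-main-thread handoff described in the docstring follows a standard queue pattern; a standalone sketch (not the mergekit implementation):

import queue
import threading

results: queue.Queue = queue.Queue()

def device_worker(task_list):
    # Compute on the worker thread; hand results to the main thread via queue.
    for name in task_list:
        results.put((name, len(name)))
    results.put(None)  # sentinel: this worker is done

t = threading.Thread(target=device_worker, args=(["a", "bb"],))
t.start()
while (item := results.get()) is not None:
    print(item)  # ('a', 1) then ('bb', 2)
t.join()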
