single-cell-data
diff --git a/‎notebooks/tutorial_lightning.ipynb
Lines changed: 27 additions & 35 deletions b/‎notebooks/tutorial_lightning.ipynb
Lines changed: 27 additions & 35 deletions
diff --git a/‎notebooks/tutorial_multiworker.ipynb
Lines changed: 245 additions & 0 deletions b/‎notebooks/tutorial_multiworker.ipynb
Lines changed: 245 additions & 0 deletions
@@ -26,9 +26,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
+      "################################################################################\n",
+      "WARNING!\n",
+      "The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
+      "future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
+      "to learn more and leave feedback.\n",
+      "################################################################################\n",
+      "\n",
+      "  deprecation_warning()\n"
+     ]
+    }
+   ],
    "source": [
     "import pytorch_lightning as pl\n",
     "import torch\n",
@@ -58,7 +74,6 @@
     "        obs_column_names=[\"cell_type\"],\n",
     "        batch_size=128,\n",
     "        shuffle=True,\n",
-    "        seed=12345,\n",
     "    )"
    ]
   },
@@ -71,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,7 +115,6 @@
     "        predictions = torch.argmax(probabilities, axis=1)\n",
     "\n",
     "        # Compute loss\n",
-    "        # y_batch = y_batch.flatten()\n",
     "        y_batch = torch.from_numpy(\n",
     "            self.cell_type_encoder.transform(y_batch[\"cell_type\"])\n",
     "        ).to(self.device)\n",
@@ -130,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -140,6 +154,7 @@
       "GPU available: True (cuda), used: True\n",
       "TPU available: False, using: 0 TPU cores\n",
       "HPU available: False, using: 0 HPUs\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
       "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
       "\n",
       "  | Name    | Type             | Params | Mode \n",
@@ -153,32 +168,15 @@
       "2.905     Total estimated model params size (MB)\n",
       "2         Modules in train mode\n",
       "0         Modules in eval mode\n",
-      "/home/ubuntu/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.\n",
-      "/home/ubuntu/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
-      "################################################################################\n",
-      "WARNING!\n",
-      "The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
-      "future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
-      "to learn more and leave feedback.\n",
-      "################################################################################\n",
-      "\n",
-      "  deprecation_warning()\n",
-      "/home/ubuntu/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
-      "################################################################################\n",
-      "WARNING!\n",
-      "The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
-      "future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
-      "to learn more and leave feedback.\n",
-      "################################################################################\n",
-      "\n",
-      "  deprecation_warning()\n"
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Epoch 19: 100%|██████████| 118/118 [00:17<00:00,  6.87it/s, v_num=6, train_loss=1.680, train_accuracy=0.977]"
+      "Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.31it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]"
      ]
     },
     {
@@ -192,14 +190,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Epoch 19: 100%|██████████| 118/118 [00:17<00:00,  6.86it/s, v_num=6, train_loss=1.680, train_accuracy=0.977]\n"
+      "Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.28it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]\n"
      ]
     }
    ],
    "source": [
-    "dataloader = soma_ml.experiment_dataloader(\n",
-    "    experiment_dataset, num_workers=2, persistent_workers=True\n",
-    ")\n",
+    "dataloader = soma_ml.experiment_dataloader(experiment_dataset)\n",
     "\n",
     "# The size of the input dimension is the number of genes\n",
     "input_dim = experiment_dataset.shape[1]\n",
@@ -213,11 +209,7 @@
     ")\n",
     "\n",
     "# Define the PyTorch Lightning Trainer\n",
-    "trainer = pl.Trainer(\n",
-    "    max_epochs=20,\n",
-    "    # accelerator=args.accelerator,\n",
-    "    # strategy=\"ddp\",\n",
-    ")\n",
+    "trainer = pl.Trainer(max_epochs=20)\n",
     "\n",
     "# set precision\n",
     "torch.set_float32_matmul_precision(\"high\")\n",
 
@@ -0,0 +1,245 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Multi-process training\n",
+    "\n",
+    "Multi-process usage of `tiledbsoma_ml.ExperimentAxisQueryIterableDataset` includes both:\n",
+    "* using the `torch.utils.data.DataLoader` with 1 or more workers (i.e., with an argument of `num_workers=1` or greater)\n",
+    "* using a multi-process training configuration, such as `DistributedDataParallel`\n",
+    "\n",
+    "In these configurations, `ExperimentAxisQueryIterableDataset` will automatically partition data across workers. However, when using `shuffle=True`, there are several things to keep in mind:\n",
+    "\n",
+    "1. All worker processes must share the same random number generator `seed`, ensuring that all workers shuffle and partition the data in the same way.\n",
+    "2. To ensure that each epoch returns a _different_ shuffle, the caller must set the epoch, using the `set_epoch` API. This is identical to the behavior of `torch.utils.data.distributed.DistributedSampler`.\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
+      "################################################################################\n",
+      "WARNING!\n",
+      "The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
+      "future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
+      "to learn more and leave feedback.\n",
+      "################################################################################\n",
+      "\n",
+      "  deprecation_warning()\n"
+     ]
+    }
+   ],
+   "source": [
+    "import tiledbsoma_ml as soma_ml\n",
+    "import torch\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "\n",
+    "import tiledbsoma as soma\n",
+    "\n",
+    "CZI_Census_Homo_Sapiens_URL = \"s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/\"\n",
+    "\n",
+    "experiment = soma.open(\n",
+    "    CZI_Census_Homo_Sapiens_URL,\n",
+    "    context=soma.SOMATileDBContext(tiledb_config={\"vfs.s3.region\": \"us-west-2\"}),\n",
+    ")\n",
+    "obs_value_filter = \"tissue_general == 'tongue' and is_primary_data == True\"\n",
+    "\n",
+    "with experiment.axis_query(\n",
+    "    measurement_name=\"RNA\", obs_query=soma.AxisQuery(value_filter=obs_value_filter)\n",
+    ") as query:\n",
+    "    obs_df = query.obs(column_names=[\"cell_type\"]).concat().to_pandas()\n",
+    "    cell_type_encoder = LabelEncoder().fit(obs_df[\"cell_type\"].unique())\n",
+    "\n",
+    "    experiment_dataset = soma_ml.ExperimentAxisQueryIterableDataset(\n",
+    "        query,\n",
+    "        X_name=\"raw\",\n",
+    "        obs_column_names=[\"cell_type\"],\n",
+    "        batch_size=128,\n",
+    "        shuffle=True,\n",
+    "    )\n",
+    "  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "class LogisticRegression(torch.nn.Module):\n",
+    "    def __init__(self, input_dim, output_dim):\n",
+    "        super(LogisticRegression, self).__init__()  # noqa: UP008\n",
+    "        self.linear = torch.nn.Linear(input_dim, output_dim)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        outputs = torch.sigmoid(self.linear(x))\n",
+    "        return outputs\n",
+    "    \n",
+    "\n",
+    "def train_epoch(model, train_dataloader, loss_fn, optimizer, device):\n",
+    "    model.train()\n",
+    "    train_loss = 0\n",
+    "    train_correct = 0\n",
+    "    train_total = 0\n",
+    "\n",
+    "    for X_batch, y_batch in train_dataloader:\n",
+    "        optimizer.zero_grad()\n",
+    "\n",
+    "        X_batch = torch.from_numpy(X_batch).float().to(device)\n",
+    "\n",
+    "        # Perform prediction\n",
+    "        outputs = model(X_batch)\n",
+    "\n",
+    "        # Determine the predicted label\n",
+    "        probabilities = torch.nn.functional.softmax(outputs, 1)\n",
+    "        predictions = torch.argmax(probabilities, axis=1)\n",
+    "\n",
+    "        # Compute the loss and perform back propagation\n",
+    "        y_batch = torch.from_numpy(cell_type_encoder.transform(y_batch['cell_type'])).to(device)\n",
+    "        train_correct += (predictions == y_batch).sum().item()\n",
+    "        train_total += len(predictions)\n",
+    "\n",
+    "        loss = loss_fn(outputs, y_batch.long())\n",
+    "        train_loss += loss.item()\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "    train_loss /= train_total\n",
+    "    train_accuracy = train_correct / train_total\n",
+    "    return train_loss, train_accuracy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi-worker DataLoader\n",
+    "\n",
+    "If you use a multi-worker data loader (i.e., `num_workers` with a value other than `0`), and `shuffle=True`, remember to call `set_epoch` at the start of each epoch, _before_ the iterator is created.\n",
+    "\n",
+    "The same approach should be taken for parallel training, e.g., when using DDP or DP.\n",
+    "\n",
+    "*Tip*: when running with `num_workers=0`, i.e., using the data loader in-process, the `ExperimentAxisQueryIterableDataset` will automatically increment the epoch count each time the iterator completes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "switching torch multiprocessing start method from \"fork\" to \"spawn\"\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
+      "################################################################################\n",
+      "WARNING!\n",
+      "The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
+      "future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
+      "to learn more and leave feedback.\n",
+      "################################################################################\n",
+      "\n",
+      "  deprecation_warning()\n",
+      "/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
+      "################################################################################\n",
+      "WARNING!\n",
+      "The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
+      "future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
+      "to learn more and leave feedback.\n",
+      "################################################################################\n",
+      "\n",
+      "  deprecation_warning()\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1: Train Loss: 0.0169229 Accuracy 0.3124\n",
+      "Epoch 2: Train Loss: 0.0148674 Accuracy 0.4272\n",
+      "Epoch 3: Train Loss: 0.0144468 Accuracy 0.4509\n",
+      "Epoch 4: Train Loss: 0.0141778 Accuracy 0.4999\n",
+      "Epoch 5: Train Loss: 0.0139660 Accuracy 0.5619\n",
+      "Epoch 6: Train Loss: 0.0137670 Accuracy 0.6971\n",
+      "Epoch 7: Train Loss: 0.0136089 Accuracy 0.8670\n",
+      "Epoch 8: Train Loss: 0.0135203 Accuracy 0.9099\n",
+      "Epoch 9: Train Loss: 0.0134427 Accuracy 0.9262\n",
+      "Epoch 10: Train Loss: 0.0133607 Accuracy 0.9300\n",
+      "Epoch 11: Train Loss: 0.0133110 Accuracy 0.9348\n",
+      "Epoch 12: Train Loss: 0.0132749 Accuracy 0.9378\n",
+      "Epoch 13: Train Loss: 0.0132431 Accuracy 0.9413\n",
+      "Epoch 14: Train Loss: 0.0132194 Accuracy 0.9444\n",
+      "Epoch 15: Train Loss: 0.0131942 Accuracy 0.9465\n",
+      "Epoch 16: Train Loss: 0.0131739 Accuracy 0.9499\n",
+      "Epoch 17: Train Loss: 0.0131527 Accuracy 0.9526\n",
+      "Epoch 18: Train Loss: 0.0131369 Accuracy 0.9551\n",
+      "Epoch 19: Train Loss: 0.0131214 Accuracy 0.9563\n",
+      "Epoch 20: Train Loss: 0.0131061 Accuracy 0.9578\n"
+     ]
+    }
+   ],
+   "source": [
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "\n",
+    "# The size of the input dimension is the number of genes\n",
+    "input_dim = experiment_dataset.shape[1]\n",
+    "\n",
+    "# The size of the output dimension is the number of distinct cell_type values\n",
+    "output_dim = len(cell_type_encoder.classes_)\n",
+    "\n",
+    "model = LogisticRegression(input_dim, output_dim).to(device)\n",
+    "loss_fn = torch.nn.CrossEntropyLoss()\n",
+    "optimizer = torch.optim.Adam(model.parameters(), lr=1e-05)\n",
+    "\n",
+    "\n",
+    "# define a two-worker data loader. The dataset is shuffled, so call `set_epoch` to ensure\n",
+    "# that a different shuffle is applied on each epoch.\n",
+    "experiment_dataloader = soma_ml.experiment_dataloader(\n",
+    "    experiment_dataset, num_workers=2, persistent_workers=True\n",
+    ")\n",
+    "\n",
+    "for epoch in range(20):\n",
+    "    experiment_dataset.set_epoch(epoch)\n",
+    "    train_loss, train_accuracy = train_epoch(\n",
+    "        model, experiment_dataloader, loss_fn, optimizer, device\n",
+    "    )\n",
+    "    print(\n",
+    "        f\"Epoch {epoch + 1}: Train Loss: {train_loss:.7f} Accuracy {train_accuracy:.4f}\"\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "toymodel",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}