Merge pull request #17 from mirsazzathossain/dev

mirsazzathossain · web-flow · commit b69e0efe436c · 2024-09-15T07:19:05.000+06:00
feat(utils): add functions to compute dataset statistics and save HTM…
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
     "numpy>=1.24.4",
     "pandas>=2.0.3",
     "pillow>=10.4.0",
+    "torch>=2.4.1",
 ]
 
 [project.urls]
diff --git a/rgc/utils/data.py b/rgc/utils/data.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 import pandas as pd
+import torch
 from astropy import units as u
 from astropy.coordinates import SkyCoord
 from astropy.io import fits
@@ -365,3 +366,52 @@ def celestial_capture_bulk(
             series = entry.to_frame().T
             failed = pd.concat([failed, series], ignore_index=True)
             print(f"Failed to capture image. {err}")
+
+
+def dataframe_to_html(catalog: pd.DataFrame, save_dir: str) -> None:
+    """
+    Write the catalog to ``catalog.html`` inside the given directory.
+
+    The directory, including any missing parents, is created before writing.
+
+    :param catalog: Catalog of the astronomical objects
+    :type catalog: pd.DataFrame
+    :param save_dir: Path to the directory to save the HTML file
+    :type save_dir: str
+    """
+    output_dir = Path(save_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    html_path = os.path.join(save_dir, "catalog.html")
+    catalog.to_html(html_path)
+
+
+def compute_mean_std(dataloader: torch.utils.data.DataLoader) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compute the per-channel mean and standard deviation of the dataset.
+
+    The first element of every batch is used as the data tensor. Statistics are
+    reduced over dimensions 0, 2 and 3, so the data is presumably laid out as
+    ``(N, C, H, W)`` -- confirm against the datasets used by callers. The standard
+    deviation uses the ``torch.std`` defaults. The whole dataset is held in memory
+    at once.
+
+    :param dataloader: The dataloader for the dataset.
+    :type dataloader: torch.utils.data.DataLoader
+
+    :return: The mean and standard deviation of the dataset.
+    :rtype: tuple[torch.Tensor, torch.Tensor]
+
+    :raises ValueError: If the dataloader yields no batches.
+    """
+    # Gather the batches and concatenate once; calling torch.cat inside the loop
+    # re-copies everything accumulated so far on every iteration.
+    chunks = [batch[0] for batch in dataloader]
+    if not chunks:
+        raise ValueError("Cannot compute mean and std of an empty dataloader.")
+
+    data = torch.cat(chunks, 0)
+    # Promote against the default float dtype so integer inputs (e.g. uint8 images)
+    # can be reduced by torch.mean / torch.std.
+    data = data.to(torch.promote_types(torch.get_default_dtype(), data.dtype))
+
+    mean = torch.mean(data, dim=(0, 2, 3))
+    std = torch.std(data, dim=(0, 2, 3))
+
+    return mean, std
+
+
+def remove_artifacts(folder: str, extension: list[str]) -> None:
+    """
+    Remove every file in a folder except those with the given extensions.
+
+    Only the direct children of ``folder`` are examined; sub-directories are
+    left untouched.
+
+    :param folder: Path to the folder to clear
+    :type folder: str
+    :param extension: List of file extensions to keep (e.g. ``[".txt", ".jpg"]``)
+    :type extension: list
+    """
+    # str.endswith accepts a tuple of suffixes; build it once instead of per file.
+    keep = tuple(extension)
+
+    for file in os.listdir(folder):
+        path = os.path.join(folder, file)
+        # os.remove cannot delete directories, so skip them rather than crash.
+        if os.path.isdir(path) and not os.path.islink(path):
+            continue
+        if not file.endswith(keep):
+            os.remove(path)
+
+    print(f"Artifacts removed from {folder}; kept files with extensions {', '.join(extension)}")
diff --git a/tests/test_compute_mean_std.py b/tests/test_compute_mean_std.py
@@ -0,0 +1,28 @@
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+
+from rgc.utils.data import compute_mean_std
+
+
+def test_compute_mean_std():
+    """Check the per-channel statistics on a tiny 3-sample, 3-channel, 2x2 dataset."""
+    # Three samples, each holding three 2x2 channels
+    images = torch.tensor([
+        [[[1.0, 2.0], [3.0, 4.0]], [[2.0, 4.0], [6.0, 8.0]], [[0.5, 1.0], [1.5, 2.0]]],  # Sample 1, 3 channels
+        [[[5.0, 6.0], [7.0, 8.0]], [[10.0, 12.0], [14.0, 16.0]], [[2.5, 3.0], [3.5, 4.0]]],  # Sample 2, 3 channels
+        [[[9.0, 10.0], [11.0, 12.0]], [[18.0, 20.0], [22.0, 24.0]], [[4.5, 5.0], [5.5, 6.0]]],  # Sample 3, 3 channels
+    ])
+    labels = torch.tensor([0, 1, 2])  # Placeholder labels, unused by the function
+
+    # batch_size=2 splits the three samples into two batches (2 + 1)
+    loader = DataLoader(TensorDataset(images, labels), batch_size=2)
+
+    # Reference statistics per channel, taken over all samples and pixels
+    expected_mean = torch.tensor([6.5000, 13.0000, 3.2500])
+    expected_std = torch.tensor([3.6056, 7.2111, 1.8028])
+
+    mean, std = compute_mean_std(loader)
+
+    # Both statistics must match the references within tolerance
+    assert torch.allclose(mean, expected_mean, atol=1e-4), f"Expected mean {expected_mean}, but got {mean}"
+    assert torch.allclose(std, expected_std, atol=1e-4), f"Expected std {expected_std}, but got {std}"
diff --git a/tests/test_dataframe_to_html.py b/tests/test_dataframe_to_html.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+import pandas as pd
+
+from rgc.utils.data import dataframe_to_html
+
+
+@patch("rgc.utils.data.Path.mkdir")
+@patch("rgc.utils.data.pd.DataFrame.to_html")
+@patch("rgc.utils.data.os.path.join", return_value="/mocked/path/catalog.html")
+def test_dataframe_to_html(mock_join, mock_to_html, mock_mkdir):
+    """Verify the directory creation, path construction and HTML export calls."""
+    target_dir = "/mocked/directory"
+    frame = pd.DataFrame({"object_name": ["Object1", "Object2"], "ra": [10.5, 20.3], "dec": [-30.1, 45.2]})
+
+    # Every filesystem-touching call is mocked, so nothing is written to disk
+    dataframe_to_html(frame, target_dir)
+
+    # The output path is built from the save directory and the fixed file name
+    mock_join.assert_called_once_with(target_dir, "catalog.html")
+
+    # The directory is created, tolerating existing or missing parents
+    mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
+
+    # The catalog is exported to the path returned by os.path.join
+    mock_to_html.assert_called_once_with("/mocked/path/catalog.html")
diff --git a/tests/test_remove_artifacts.py b/tests/test_remove_artifacts.py
@@ -0,0 +1,43 @@
+import os
+import tempfile
+import unittest
+from pathlib import Path
+
+from rgc.utils.data import remove_artifacts
+
+
+class TestRemoveArtifacts(unittest.TestCase):
+    """Exercise ``remove_artifacts`` against a throw-away directory of empty files."""
+
+    def setUp(self):
+        # Fresh temporary directory for every test
+        self.test_dir = tempfile.TemporaryDirectory()
+
+        # Populate it with empty files covering several extensions
+        self.test_files = ["file1.txt", "file2.jpg", "file3.txt", "file4.png", "file5.csv"]
+        for name in self.test_files:
+            Path(self.test_dir.name, name).touch()
+
+    def tearDown(self):
+        # Delete the temporary directory and whatever is left inside it
+        self.test_dir.cleanup()
+
+    def test_remove_artifacts(self):
+        # Extensions whose files must survive
+        keep = [".txt", ".jpg"]
+
+        remove_artifacts(self.test_dir.name, keep)
+
+        survivors = os.listdir(self.test_dir.name)
+
+        # Exactly the .txt and .jpg files remain
+        self.assertEqual(sorted(survivors), sorted(["file1.txt", "file2.jpg", "file3.txt"]))
+
+        # Files with any other extension are gone
+        for removed in ("file4.png", "file5.csv"):
+            self.assertNotIn(removed, survivors)
+
+
+# Allow running this test module directly as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ dependencies = [`
`17`	`17`	`"numpy>=1.24.4",`
`18`	`18`	`"pandas>=2.0.3",`
`19`	`19`	`"pillow>=10.4.0",`
	`20`	`+ "torch>=2.4.1",`
`20`	`21`	`]`
`21`	`22`
`22`	`23`	`[project.urls]`