added documentation
axiomcura committed Jan 22, 2025
1 parent 63ada14 commit 45b76fb
Showing 5 changed files with 83 additions and 14 deletions.
14 changes: 12 additions & 2 deletions pycytominer/cyto_utils/features.py
@@ -173,19 +173,29 @@ def drop_outlier_features(
Features greater than the threshold.
"""

# Subset dataframe
# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)
# Subset the DataFrame to only include inferred CellProfiler features
population_df = population_df.loc[:, features]
else:
# Subset the DataFrame to only include the features of interest
    # These are user-provided features, which may include non-CellProfiler features
population_df = population_df.loc[:, features]

# Get the max and min values for each feature
max_feature_values = population_df.max().abs()
min_feature_values = population_df.min().abs()

# Identify features with max or min values greater than the outlier cutoff
outlier_features = max_feature_values[
(max_feature_values > outlier_cutoff) | (min_feature_values > outlier_cutoff)
].index.tolist()
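
To make the flow above concrete, here is a minimal standalone sketch of the same subsetting and outlier check using pandas directly; the column names, the query string, and the cutoff are invented for illustration and are not pycytominer defaults.

import pandas as pd

# Toy profile table; Metadata/feature column names are invented for illustration
population_df = pd.DataFrame({
    "Metadata_treatment": ["control", "control", "drug"],
    "Cells_AreaShape_Area": [1.2, -0.8, 0.5],
    "Nuclei_Intensity_MeanIntensity": [3.1, 250.0, 0.9],
})

samples = "Metadata_treatment == 'control'"  # hypothetical pandas query expression
features = ["Cells_AreaShape_Area", "Nuclei_Intensity_MeanIntensity"]
outlier_cutoff = 100  # hypothetical cutoff

# Filter rows with a pandas query expression, then keep only the feature columns
subset_df = population_df.query(expr=samples).loc[:, features]

# Flag features whose absolute maximum or minimum exceeds the cutoff
max_vals = subset_df.max().abs()
min_vals = subset_df.min().abs()
outlier_features = max_vals[
    (max_vals > outlier_cutoff) | (min_vals > outlier_cutoff)
].index.tolist()
print(outlier_features)  # ['Nuclei_Intensity_MeanIntensity']
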
11 changes: 9 additions & 2 deletions pycytominer/operations/correlation_threshold.py
@@ -40,19 +40,26 @@ def correlation_threshold(
List of features to exclude from the population_df.
"""

# Check that the input method is supported
# Checking if the provided correlation method is supported
method = check_correlation_method(method)

# Checking if the threshold is between 0 and 1
if not 0 <= threshold <= 1:
raise ValueError("threshold variable must be between (0 and 1)")

# Subset dataframe and calculate correlation matrix across subset features
# If samples is not 'all', then subset the dataframe
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)

# Subset the DataFrame to only include the features of interest
population_df = population_df.loc[:, features]

# Get correlation matrix and lower triangle of pairwise correlations in long format
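
The correlation step this hunk leads into can be sketched roughly as follows; the feature names and threshold are invented, and this simplified version only illustrates the lower-triangle, long-format idea rather than pycytominer's actual implementation.

import numpy as np
import pandas as pd

# Toy feature matrix; column names are invented for illustration
df = pd.DataFrame({
    "feat_a": [1.0, 2.0, 3.0, 4.0],
    "feat_b": [2.0, 4.1, 5.9, 8.2],  # nearly collinear with feat_a
    "feat_c": [0.5, -1.0, 0.3, 0.2],
})

# Absolute pairwise Pearson correlations
corr = df.corr(method="pearson").abs()

# Keep only the strictly lower triangle, then stack into long (pairwise) format
mask = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
pairwise = corr.where(mask).stack()

threshold = 0.9  # hypothetical threshold
print(pairwise[pairwise > threshold])
# The (feat_b, feat_a) pair exceeds the threshold, so one of the two would be dropped
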
15 changes: 14 additions & 1 deletion pycytominer/operations/get_na_columns.py
@@ -32,16 +32,29 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
List of features to exclude from the population_df.
"""

# Checking if the cutoff is between 0 and 1
if not 0 <= cutoff <= 1:
raise ValueError("cutoff variable must be between (0 and 1)")

# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)

# Subset the DataFrame to only include the features of interest
population_df = population_df.loc[:, features]

# Get the proportion of NA values for each feature
num_rows = population_df.shape[0]
na_prop_df = population_df.isna().sum() / num_rows

# Get the features that have more NA values than the cutoff
na_prop_df = na_prop_df[na_prop_df > cutoff]
return list(set(na_prop_df.index.tolist()))
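
For reference, a minimal standalone sketch of the NA-proportion check described above; the column names, data, and cutoff are invented for illustration.

import numpy as np
import pandas as pd

# Toy data with missing values; column names are invented for illustration
df = pd.DataFrame({
    "feat_a": [1.0, np.nan, np.nan, 4.0],  # 50% missing
    "feat_b": [1.0, 2.0, 3.0, np.nan],     # 25% missing
    "feat_c": [1.0, 2.0, 3.0, 4.0],        # complete
})

cutoff = 0.3  # hypothetical cutoff

# Proportion of NA values per feature, then keep features above the cutoff
na_prop = df.isna().sum() / df.shape[0]
print(na_prop[na_prop > cutoff].index.tolist())  # ['feat_a']
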
29 changes: 24 additions & 5 deletions pycytominer/operations/noise_removal.py
@@ -41,30 +41,43 @@ def noise_removal(
        A list of features to be removed because their standard deviation within replicate groups is too high.
"""
# Subset dataframe

# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)
        # The DataFrame is subset to these inferred features later, after the perturbation groups are assigned

# If a metadata column name is specified, use that as the perturb groups
    # if a metadata column name is specified, use that as the perturb groups
if isinstance(noise_removal_perturb_groups, str):
# Check if the column exists
if noise_removal_perturb_groups not in population_df.columns:
raise ValueError(
'f"{perturb} not found. Are you sure it is a ' "metadata column?"
)
# Assign the group info to the specified column
group_info = population_df[noise_removal_perturb_groups]

# Otherwise, the user specifies a list of perturbs
elif isinstance(noise_removal_perturb_groups, list):
# Check if the length of the noise_removal_perturb_groups is the same as the
# number of rows in the df
if not len(noise_removal_perturb_groups) == len(population_df):
raise ValueError(
f"The length of input list: {len(noise_removal_perturb_groups)} is not equivalent to your "
f"data: {population_df.shape[0]}"
)
        # Assign the group info to the provided noise_removal_perturb_groups list
group_info = noise_removal_perturb_groups
else:
# Raise an error if the input is not a list or a string
raise TypeError(
"noise_removal_perturb_groups must be a list corresponding to row perturbations or a str \
specifying the name of the metadata column."
@@ -74,10 +87,16 @@
population_df = population_df.loc[:, features]
population_df = population_df.assign(group_id=group_info)

# Get the standard deviations of features within each group
    # Get the standard deviation of each feature within each group, then take the mean
    # of these standard deviations across groups.
    # This summarizes, for each feature, how much it typically varies within a
    # perturbation group.
stdev_means_df = population_df.groupby("group_id").std(ddof=0).mean()

# Identify noisy features with a greater mean stdev within perturbation group than the threshold
    # Using stdev_means_df, identify features whose mean within-group standard deviation
    # exceeds the cutoff.
    # These features are considered too variable within replicate groups and are
    # collected into the list of features to remove.
to_remove = stdev_means_df[
stdev_means_df > noise_removal_stdev_cutoff
].index.tolist()
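
To illustrate the group-wise standard deviation step, here is a small sketch with an invented replicate-group column, features, and cutoff; it mirrors, but simplifies, the snippet above.

import pandas as pd

# Toy profiles with a replicate-group label; names and values are invented
df = pd.DataFrame({
    "group_id": ["drug_A", "drug_A", "drug_B", "drug_B"],
    "feat_stable": [1.0, 1.1, 5.0, 5.1],  # consistent within each group
    "feat_noisy": [0.0, 9.0, -4.0, 6.0],  # varies widely within each group
})

# Within-group standard deviation of each feature, averaged across groups
stdev_means = df.groupby("group_id").std(ddof=0).mean()

noise_removal_stdev_cutoff = 1.0  # hypothetical cutoff
to_remove = stdev_means[stdev_means > noise_removal_stdev_cutoff].index.tolist()
print(to_remove)  # ['feat_noisy']
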
28 changes: 24 additions & 4 deletions pycytominer/operations/variance_threshold.py
@@ -44,37 +44,57 @@ def variance_threshold(
"""

# check if freq_cut and unique_cut are between 0 and 1
if not 0 <= freq_cut <= 1:
raise ValueError("freq_cut variable must be between (0 and 1)")
if not 0 <= unique_cut <= 1:
raise ValueError("unique_cut variable must be between (0 and 1)")

# Subset dataframe
# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)

# Subset the DataFrame to only include the features of interest
population_df = population_df.loc[:, features]

# Exclude features with extreme (defined by freq_cut ratio) common values
    # Exclude features based on frequency
    # Frequency is the ratio of the count of the second most common value to the
    # count of the most common value.
    # Features with a frequency ratio below the `freq_cut` threshold are flagged for exclusion.
excluded_features_freq = population_df.apply(
lambda x: calculate_frequency(x, freq_cut), axis="rows"
)

    # Features flagged (returned as NaN) by calculate_frequency are marked for exclusion
excluded_features_freq = excluded_features_freq[
excluded_features_freq.isna()
].index.tolist()

# Exclude features with too many (defined by unique_ratio) values in common
# Get the number of samples
n = population_df.shape[0]

    # Get the number of unique values for each feature
num_unique_features = population_df.nunique()

    # Exclude features with too many (defined by unique_ratio) values in common, where
    # unique_ratio is defined as the number of unique values in a feature divided by
    # the total number of samples
unique_ratio = num_unique_features / n
unique_ratio = unique_ratio < unique_cut

# Get the feature names that have a unique ratio less than the unique_cut
# This represents features that have too few unique values compared to the number
# of samples.
excluded_features_unique = unique_ratio[unique_ratio].index.tolist()

# Combine the two lists of features to exclude
excluded_features = list(set(excluded_features_freq + excluded_features_unique))
return excluded_features
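
As a rough, self-contained illustration of the two filters above, the sketch below computes a frequency ratio (count of the second most common value over the count of the most common value) and a unique ratio (distinct values over number of samples); the helper function, data, and cutoffs are invented and simplify pycytominer's calculate_frequency.

import pandas as pd

# Toy data; column names, values, and cutoffs are invented for illustration
df = pd.DataFrame({
    "feat_constant": [1, 1, 1, 1, 1, 2],  # dominated by a single value
    "feat_varied": [1, 2, 3, 4, 5, 6],    # every value unique
    "feat_binary": [0, 1, 0, 1, 0, 1],
})

freq_cut = 0.25   # hypothetical cutoff on the frequency ratio
unique_cut = 0.4  # hypothetical cutoff on the unique ratio
n = df.shape[0]

def frequency_ratio(col):
    # Count of the second most common value divided by the count of the most common value
    counts = col.value_counts()
    return counts.iloc[1] / counts.iloc[0] if len(counts) > 1 else 0.0

# Features dominated by one value (low frequency ratio)
freq_ratios = df.apply(frequency_ratio)
excluded_freq = freq_ratios[freq_ratios < freq_cut].index.tolist()

# Features with too few distinct values relative to the number of samples
unique_ratio = df.nunique() / n
excluded_unique = unique_ratio[unique_ratio < unique_cut].index.tolist()

print(sorted(set(excluded_freq + excluded_unique)))
# ['feat_binary', 'feat_constant']
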

