added documentation
axiomcura committed Jan 22, 2025
1 parent 63ada14 commit 45b76fb
Showing 5 changed files with 83 additions and 14 deletions.
14 changes: 12 additions & 2 deletions pycytominer/cyto_utils/features.py
@@ -173,19 +173,29 @@ def drop_outlier_features(
Features greater than the threshold.
"""

# Subset dataframe
# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)
# Subset the DataFrame to only include inferred CellProfiler features
population_df = population_df.loc[:, features]
else:
# Subset the DataFrame to only include the features of interest
    # These are user-provided features, which may include non-CellProfiler features
population_df = population_df.loc[:, features]

# Get the max and min values for each feature
max_feature_values = population_df.max().abs()
min_feature_values = population_df.min().abs()

# Identify features with max or min values greater than the outlier cutoff
outlier_features = max_feature_values[
(max_feature_values > outlier_cutoff) | (min_feature_values > outlier_cutoff)
].index.tolist()
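
To make the flow above concrete, here is a minimal standalone sketch of the same subsetting and outlier check using pandas directly; the column names, the query string, and the cutoff are invented for illustration and are not pycytominer defaults.

import pandas as pd

# Toy profile table; Metadata/feature column names are invented for illustration
population_df = pd.DataFrame({
    "Metadata_treatment": ["control", "control", "drug"],
    "Cells_AreaShape_Area": [1.2, -0.8, 0.5],
    "Nuclei_Intensity_MeanIntensity": [3.1, 250.0, 0.9],
})

samples = "Metadata_treatment == 'control'"  # hypothetical pandas query expression
features = ["Cells_AreaShape_Area", "Nuclei_Intensity_MeanIntensity"]
outlier_cutoff = 100  # hypothetical cutoff

# Filter rows with a pandas query expression, then keep only the feature columns
subset_df = population_df.query(expr=samples).loc[:, features]

# Flag features whose absolute maximum or minimum exceeds the cutoff
max_vals = subset_df.max().abs()
min_vals = subset_df.min().abs()
outlier_features = max_vals[
    (max_vals > outlier_cutoff) | (min_vals > outlier_cutoff)
].index.tolist()
print(outlier_features)  # ['Nuclei_Intensity_MeanIntensity']
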
11 changes: 9 additions & 2 deletions pycytominer/operations/correlation_threshold.py
@@ -40,19 +40,26 @@ def correlation_threshold(
List of features to exclude from the population_df.
"""

# Check that the input method is supported
# Checking if the provided correlation method is supported
method = check_correlation_method(method)

# Checking if the threshold is between 0 and 1
if not 0 <= threshold <= 1:
raise ValueError("threshold variable must be between (0 and 1)")

# Subset dataframe and calculate correlation matrix across subset features
# If samples is not 'all', then subset the dataframe
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)

# Subset the DataFrame to only include the features of interest
population_df = population_df.loc[:, features]

# Get correlation matrix and lower triangle of pairwise correlations in long format
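
The correlation step this hunk leads into can be sketched roughly as follows; the feature names and threshold are invented, and this simplified version only illustrates the lower-triangle, long-format idea rather than pycytominer's actual implementation.

import numpy as np
import pandas as pd

# Toy feature matrix; column names are invented for illustration
df = pd.DataFrame({
    "feat_a": [1.0, 2.0, 3.0, 4.0],
    "feat_b": [2.0, 4.1, 5.9, 8.2],  # nearly collinear with feat_a
    "feat_c": [0.5, -1.0, 0.3, 0.2],
})

# Absolute pairwise Pearson correlations
corr = df.corr(method="pearson").abs()

# Keep only the strictly lower triangle, then stack into long (pairwise) format
mask = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
pairwise = corr.where(mask).stack()

threshold = 0.9  # hypothetical threshold
print(pairwise[pairwise > threshold])
# The (feat_b, feat_a) pair exceeds the threshold, so one of the two would be dropped
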
15 changes: 14 additions & 1 deletion pycytominer/operations/get_na_columns.py
@@ -32,16 +32,29 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
List of features to exclude from the population_df.
"""

# Checking if the cutoff is between 0 and 1
if not 0 <= cutoff <= 1:
raise ValueError("cutoff variable must be between (0 and 1)")

# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)

# Subset the DataFrame to only include the features of interest
population_df = population_df.loc[:, features]

# Get the proportion of NA values for each feature
num_rows = population_df.shape[0]
na_prop_df = population_df.isna().sum() / num_rows

# Get the features that have more NA values than the cutoff
na_prop_df = na_prop_df[na_prop_df > cutoff]
return list(set(na_prop_df.index.tolist()))
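
For reference, a minimal standalone sketch of the NA-proportion check described above; the column names, data, and cutoff are invented for illustration.

import numpy as np
import pandas as pd

# Toy data with missing values; column names are invented for illustration
df = pd.DataFrame({
    "feat_a": [1.0, np.nan, np.nan, 4.0],  # 50% missing
    "feat_b": [1.0, 2.0, 3.0, np.nan],     # 25% missing
    "feat_c": [1.0, 2.0, 3.0, 4.0],        # complete
})

cutoff = 0.3  # hypothetical cutoff

# Proportion of NA values per feature, then keep features above the cutoff
na_prop = df.isna().sum() / df.shape[0]
print(na_prop[na_prop > cutoff].index.tolist())  # ['feat_a']
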
29 changes: 24 additions & 5 deletions pycytominer/operations/noise_removal.py
@@ -41,30 +41,43 @@ def noise_removal(
        A list of features to be removed because their standard deviation within replicate groups is too high.
"""
# Subset dataframe

# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)
        # The DataFrame is subset to these inferred features later, after the perturbation groups are assigned

# If a metadata column name is specified, use that as the perturb groups
    # if a metadata column name is specified, use that as the perturb groups
if isinstance(noise_removal_perturb_groups, str):
# Check if the column exists
if noise_removal_perturb_groups not in population_df.columns:
raise ValueError(
'f"{perturb} not found. Are you sure it is a ' "metadata column?"
)
# Assign the group info to the specified column
group_info = population_df[noise_removal_perturb_groups]

# Otherwise, the user specifies a list of perturbs
elif isinstance(noise_removal_perturb_groups, list):
# Check if the length of the noise_removal_perturb_groups is the same as the
# number of rows in the df
if not len(noise_removal_perturb_groups) == len(population_df):
raise ValueError(
f"The length of input list: {len(noise_removal_perturb_groups)} is not equivalent to your "
f"data: {population_df.shape[0]}"
)
        # Assign the group info to the provided noise_removal_perturb_groups list
group_info = noise_removal_perturb_groups
else:
# Raise an error if the input is not a list or a string
raise TypeError(
"noise_removal_perturb_groups must be a list corresponding to row perturbations or a str \
specifying the name of the metadata column."
@@ -74,10 +87,16 @@
population_df = population_df.loc[:, features]
population_df = population_df.assign(group_id=group_info)

# Get the standard deviations of features within each group
    # Get the standard deviation of each feature within each group, then take the mean
    # of these standard deviations across groups.
    # This summarizes, for each feature, how much it typically varies within a
    # perturbation group.
stdev_means_df = population_df.groupby("group_id").std(ddof=0).mean()

# Identify noisy features with a greater mean stdev within perturbation group than the threshold
    # Using stdev_means_df, identify features whose mean within-group standard deviation
    # exceeds the cutoff.
    # These features are considered too variable within replicate groups and are
    # collected into the list of features to remove.
to_remove = stdev_means_df[
stdev_means_df > noise_removal_stdev_cutoff
].index.tolist()
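
To illustrate the group-wise standard deviation step, here is a small sketch with an invented replicate-group column, features, and cutoff; it mirrors, but simplifies, the snippet above.

import pandas as pd

# Toy profiles with a replicate-group label; names and values are invented
df = pd.DataFrame({
    "group_id": ["drug_A", "drug_A", "drug_B", "drug_B"],
    "feat_stable": [1.0, 1.1, 5.0, 5.1],  # consistent within each group
    "feat_noisy": [0.0, 9.0, -4.0, 6.0],  # varies widely within each group
})

# Within-group standard deviation of each feature, averaged across groups
stdev_means = df.groupby("group_id").std(ddof=0).mean()

noise_removal_stdev_cutoff = 1.0  # hypothetical cutoff
to_remove = stdev_means[stdev_means > noise_removal_stdev_cutoff].index.tolist()
print(to_remove)  # ['feat_noisy']
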
28 changes: 24 additions & 4 deletions pycytominer/operations/variance_threshold.py
@@ -44,37 +44,57 @@ def variance_threshold(
"""

# check if freq_cut and unique_cut are between 0 and 1
if not 0 <= freq_cut <= 1:
raise ValueError("freq_cut variable must be between (0 and 1)")
if not 0 <= unique_cut <= 1:
raise ValueError("unique_cut variable must be between (0 and 1)")

# Subset dataframe
# Subset the DataFrame if specific samples are specified
# If "all", use the entire DataFrame without subsetting
if samples != "all":
population_df = population_df.query(samples)
# Using pandas query to filter rows based on the conditions provided in the
# samples parameter
population_df = population_df.query(expr=samples)

# Infer CellProfiler features if 'features' is set to 'infer'
if features == "infer":
# Infer CellProfiler features
features = infer_cp_features(population_df)

# Subset the DataFrame to only include the features of interest
population_df = population_df.loc[:, features]

# Exclude features with extreme (defined by freq_cut ratio) common values
    # Exclude features based on frequency
    # Frequency is the ratio of the count of the second most common value to the
    # count of the most common value.
    # Features with a frequency ratio below the `freq_cut` threshold are flagged for exclusion.
excluded_features_freq = population_df.apply(
lambda x: calculate_frequency(x, freq_cut), axis="rows"
)

    # Features flagged (returned as NaN) by calculate_frequency are marked for exclusion
excluded_features_freq = excluded_features_freq[
excluded_features_freq.isna()
].index.tolist()

# Exclude features with too many (defined by unique_ratio) values in common
# Get the number of samples
n = population_df.shape[0]

    # Get the number of unique values for each feature
num_unique_features = population_df.nunique()

    # Exclude features with too many (defined by unique_ratio) values in common, where
    # unique_ratio is defined as the number of unique values in a feature divided by
    # the total number of samples
unique_ratio = num_unique_features / n
unique_ratio = unique_ratio < unique_cut

# Get the feature names that have a unique ratio less than the unique_cut
# This represents features that have too few unique values compared to the number
# of samples.
excluded_features_unique = unique_ratio[unique_ratio].index.tolist()

# Combine the two lists of features to exclude
excluded_features = list(set(excluded_features_freq + excluded_features_unique))
return excluded_features
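
As a rough, self-contained illustration of the two filters above, the sketch below computes a frequency ratio (count of the second most common value over the count of the most common value) and a unique ratio (distinct values over number of samples); the helper function, data, and cutoffs are invented and simplify pycytominer's calculate_frequency.

import pandas as pd

# Toy data; column names, values, and cutoffs are invented for illustration
df = pd.DataFrame({
    "feat_constant": [1, 1, 1, 1, 1, 2],  # dominated by a single value
    "feat_varied": [1, 2, 3, 4, 5, 6],    # every value unique
    "feat_binary": [0, 1, 0, 1, 0, 1],
})

freq_cut = 0.25   # hypothetical cutoff on the frequency ratio
unique_cut = 0.4  # hypothetical cutoff on the unique ratio
n = df.shape[0]

def frequency_ratio(col):
    # Count of the second most common value divided by the count of the most common value
    counts = col.value_counts()
    return counts.iloc[1] / counts.iloc[0] if len(counts) > 1 else 0.0

# Features dominated by one value (low frequency ratio)
freq_ratios = df.apply(frequency_ratio)
excluded_freq = freq_ratios[freq_ratios < freq_cut].index.tolist()

# Features with too few distinct values relative to the number of samples
unique_ratio = df.nunique() / n
excluded_unique = unique_ratio[unique_ratio < unique_cut].index.tolist()

print(sorted(set(excluded_freq + excluded_unique)))
# ['feat_binary', 'feat_constant']
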

