Skip to content

Commit

Permalink
Setup CI/CD Pipeline for MSStats HPC (#143)
Browse files Browse the repository at this point in the history
* Setup CI/CD Pipeline for MSStats
  • Loading branch information
anshuman-raina authored Nov 26, 2024
1 parent f494c87 commit b21a35e
Show file tree
Hide file tree
Showing 3 changed files with 287 additions and 0 deletions.
77 changes: 77 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: Run Simple R Script on HPC via Slurm

on:
push:
branches:
- feature/ci-cd-pipeline
# - devel

jobs:
Benchmarking-pipeline:
runs-on: ubuntu-latest
env:
FDR_THRESHOLD: 0.21

steps:
- name: Checkout Repository
uses: actions/checkout@v3

- name: Set Up SSH Access
run: |
mkdir -p ~/.ssh
touch ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1
- name: Transfer Files to HPC
run: |
scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1
- name: Submit Slurm Job and Capture Job ID
id: submit_job
run: |
ssh raina.ans@login-00.discovery.neu.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt
slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt)
echo "Slurm Job ID is $slurm_job_id"
echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV
- name: Monitor Slurm Job
run: |
ssh raina.ans@login-00.discovery.neu.edu "
while squeue -j ${{ env.slurm_job_id }} | grep -q ${{ env.slurm_job_id }}; do
echo 'Job Id : ${{ env.slurm_job_id }} is still running...'
sleep 10
done
echo 'Job has completed.'
"
- name: Fetch Output
run: |
scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt
scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt
- name: Extract and Check FDR Values
run: |
grep -A 4 "FPR Accuracy Recall FDR" job_output.txt > results_section.txt
if awk -v threshold="$FDR_THRESHOLD" '
NR > 1 {
if ($5 > threshold) {
print "Error: FDR too high in row " NR " with FDR=" $5
exit 1
}
}
' results_section.txt; then
echo "All FDR values are within acceptable range."
else
exit 1
fi
- name: Upload Output as Artifact
uses: actions/upload-artifact@v4
with:
name: benchmark-output
path: |
job_output.txt
job_error.txt
180 changes: 180 additions & 0 deletions benchmark/benchmark.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
library(MSstatsConvert)
library(MSstats)
library(ggplot2)
library(dplyr)
library(stringr)
library(parallel)


calculateResult <- function(summarized, label){

model = groupComparison("pairwise", summarized)
comparisonResult <- model$ComparisonResult

human_comparisonResult <- comparisonResult %>% filter(grepl("_HUMAN$", Protein))

ecoli_comparisonResult <- comparisonResult %>% filter(grepl("_ECOLI$", Protein))

yeast_comparisonResult <- comparisonResult %>% filter(grepl("_YEAST$", Protein))


human_median <- median(human_comparisonResult$log2FC, na.rm = TRUE)
ecoli_median <- median(ecoli_comparisonResult$log2FC, na.rm = TRUE)
yeast_median <- median(yeast_comparisonResult$log2FC, na.rm = TRUE)

cat("Expected Log Change Human:", human_median, "\n")
cat("Expected Log Change Ecoli:", ecoli_median, "\n")
cat("Expected Log Change Yeast:", yeast_median, "\n")

boxplot(ecoli_comparisonResult$log2FC,
main = "Boxplot of log2FC for E. coli",
ylab = "log2FC",
col = "lightgreen")
combined_data <- list(
Human = human_comparisonResult$log2FC,
Ecoli = ecoli_comparisonResult$log2FC,
Yeast = yeast_comparisonResult$log2FC
)


unique_ecoli_proteins <- unique(ecoli_comparisonResult$Protein)
unique_yeast_proteins <- unique(yeast_comparisonResult$Protein)

all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins))

extracted_proteins <- sapply(all_proteins, function(x) {
split_string <- strsplit(x, "\\|")[[1]]
if (length(split_string) >= 2) {
return(split_string[2])
} else {
return(NA)
}
})

extracted_proteins <- unname(unlist(extracted_proteins))

proteins <- c(extracted_proteins)


TP <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow()


FP <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow()


TN <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow()


FN <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow()

cat("True Positives (Yeast and EColi): ", TP, "\n")
cat("False Positives (Human Samples)", FP, "\n")
cat("True Negatives", TN, "\n")
cat("False Negatives", FN, "\n")

ecoli_comparisonResult %>%
filter(is.finite(log2FC)) %>%
ggplot() + geom_boxplot(aes(y=log2FC)) +
geom_hline(aes(yintercept=-2), linetype="dashed", color="red", linewidth=2) +
theme_bw() +
labs(title = "Boxplot of log2FC for E. coli",
y = "log2FC")

human_comparisonResult %>%
filter(is.finite(log2FC)) %>%
ggplot() + geom_boxplot(aes(y=log2FC)) +
geom_hline(aes(yintercept=-2), linetype="dashed", color="blue", linewidth=2) +
theme_bw() +
labs(title = "Boxplot of log2FC for Human",
y = "log2FC")


yeast_comparisonResult %>%
filter(is.finite(log2FC)) %>%
ggplot() + geom_boxplot(aes(y=log2FC)) +
geom_hline(aes(yintercept=-2), linetype="dashed", color="green", linewidth=2) +
theme_bw() +
labs(title = "Boxplot of log2FC for Yeast",
y = "log2FC")

FPR <- FP / (FP + TN)

# Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)

# Recall
recall <- TP / (TP + FN)

# False Discover Rate (FDR)

fdr <- FP/ (FP + TP)

results <- data.frame(
Label = label,
TP = TP,
FP = FP,
TN = TN,
FN = FN,
FPR = FPR,
Accuracy = accuracy,
Recall = recall,
FDR = fdr
)

return(results)

}

start_time <- Sys.time()

fragpipe_raw = data.table::fread("/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats.csv")

head(fragpipe_raw)

fragpipe_raw$Condition = unlist(lapply(fragpipe_raw$Run, function(x){
paste(str_split(x, "\\_")[[1]][4:5], collapse="_")
}))

fragpipe_raw$BioReplicate = unlist(lapply(fragpipe_raw$Run, function(x){
paste(str_split(x, "\\_")[[1]][4:7], collapse="_")
}))

msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)


data_process_tasks <- list(
list(
label = "Data process with Normalized Data",
result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20)
),
list(
label = "Data process with Normalization and MBImpute False",
result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE)
),
list(
label = "Data process without Normalization",
result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20)
),
list(
label = "Data process without Normalization with MBImpute False",
result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE)
)
)

start_time <- Sys.time()

num_cores <- detectCores() - 1

summarized_results <- mclapply(data_process_tasks, function(task) {
list(label = task$label, summarized = task$result())
}, mc.cores = num_cores)

results_list <- mclapply(summarized_results, function(res) {
calculateResult(res$summarized, res$label)
}, mc.cores = num_cores)

final_results <- do.call(rbind, results_list)
end_time <- Sys.time()
total_time <- end_time - start_time
print(final_results)
print(paste("Total Execution Time:", total_time))
30 changes: 30 additions & 0 deletions benchmark/config.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#SBATCH --job-name=msstats_benchmark_job
#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/
#SBATCH --output=job_output.txt
#SBATCH --error=job_error.txt
#SBATCH --time=01:00:00 # Set the maximum run time
#SBATCH --ntasks=1 # Number of tasks (one process)
#SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task
#SBATCH --mem=256G # Request 256GB of memory
#SBATCH --partition=short # Use the 'short' partition (or change as needed)

module load R-geospatial


module load gcc/11.1.0
module load cmake/3.23.2

export LC_ALL=C
export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial


mkdir -p $R_LIBS_USER

module load R
Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \
remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \
remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \
install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')"

Rscript benchmark.R

0 comments on commit b21a35e

Please sign in to comment.