-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Setup CI/CD Pipeline for MSStats HPC (#143)
* Setup CI/CD Pipeline for MSStats
- Loading branch information
1 parent
f494c87
commit b21a35e
Showing
3 changed files
with
287 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
name: Run Simple R Script on HPC via Slurm | ||
|
||
on: | ||
push: | ||
branches: | ||
- feature/ci-cd-pipeline | ||
# - devel | ||
|
||
jobs: | ||
Benchmarking-pipeline: | ||
runs-on: ubuntu-latest | ||
env: | ||
FDR_THRESHOLD: 0.21 | ||
|
||
steps: | ||
- name: Checkout Repository | ||
uses: actions/checkout@v3 | ||
|
||
- name: Set Up SSH Access | ||
run: | | ||
mkdir -p ~/.ssh | ||
touch ~/.ssh/id_rsa | ||
chmod 600 ~/.ssh/id_rsa | ||
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa | ||
ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1 | ||
- name: Transfer Files to HPC | ||
run: | | ||
scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1 | ||
- name: Submit Slurm Job and Capture Job ID | ||
id: submit_job | ||
run: | | ||
ssh raina.ans@login-00.discovery.neu.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt | ||
slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) | ||
echo "Slurm Job ID is $slurm_job_id" | ||
echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV | ||
- name: Monitor Slurm Job | ||
run: | | ||
ssh raina.ans@login-00.discovery.neu.edu " | ||
while squeue -j ${{ env.slurm_job_id }} | grep -q ${{ env.slurm_job_id }}; do | ||
echo 'Job Id : ${{ env.slurm_job_id }} is still running...' | ||
sleep 10 | ||
done | ||
echo 'Job has completed.' | ||
" | ||
- name: Fetch Output | ||
run: | | ||
scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt | ||
scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt | ||
- name: Extract and Check FDR Values | ||
run: | | ||
grep -A 4 "FPR Accuracy Recall FDR" job_output.txt > results_section.txt | ||
if awk -v threshold="$FDR_THRESHOLD" ' | ||
NR > 1 { | ||
if ($5 > threshold) { | ||
print "Error: FDR too high in row " NR " with FDR=" $5 | ||
exit 1 | ||
} | ||
} | ||
' results_section.txt; then | ||
echo "All FDR values are within acceptable range." | ||
else | ||
exit 1 | ||
fi | ||
- name: Upload Output as Artifact | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: benchmark-output | ||
path: | | ||
job_output.txt | ||
job_error.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
library(MSstatsConvert) | ||
library(MSstats) | ||
library(ggplot2) | ||
library(dplyr) | ||
library(stringr) | ||
library(parallel) | ||
|
||
|
||
calculateResult <- function(summarized, label){ | ||
|
||
model = groupComparison("pairwise", summarized) | ||
comparisonResult <- model$ComparisonResult | ||
|
||
human_comparisonResult <- comparisonResult %>% filter(grepl("_HUMAN$", Protein)) | ||
|
||
ecoli_comparisonResult <- comparisonResult %>% filter(grepl("_ECOLI$", Protein)) | ||
|
||
yeast_comparisonResult <- comparisonResult %>% filter(grepl("_YEAST$", Protein)) | ||
|
||
|
||
human_median <- median(human_comparisonResult$log2FC, na.rm = TRUE) | ||
ecoli_median <- median(ecoli_comparisonResult$log2FC, na.rm = TRUE) | ||
yeast_median <- median(yeast_comparisonResult$log2FC, na.rm = TRUE) | ||
|
||
cat("Expected Log Change Human:", human_median, "\n") | ||
cat("Expected Log Change Ecoli:", ecoli_median, "\n") | ||
cat("Expected Log Change Yeast:", yeast_median, "\n") | ||
|
||
boxplot(ecoli_comparisonResult$log2FC, | ||
main = "Boxplot of log2FC for E. coli", | ||
ylab = "log2FC", | ||
col = "lightgreen") | ||
combined_data <- list( | ||
Human = human_comparisonResult$log2FC, | ||
Ecoli = ecoli_comparisonResult$log2FC, | ||
Yeast = yeast_comparisonResult$log2FC | ||
) | ||
|
||
|
||
unique_ecoli_proteins <- unique(ecoli_comparisonResult$Protein) | ||
unique_yeast_proteins <- unique(yeast_comparisonResult$Protein) | ||
|
||
all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins)) | ||
|
||
extracted_proteins <- sapply(all_proteins, function(x) { | ||
split_string <- strsplit(x, "\\|")[[1]] | ||
if (length(split_string) >= 2) { | ||
return(split_string[2]) | ||
} else { | ||
return(NA) | ||
} | ||
}) | ||
|
||
extracted_proteins <- unname(unlist(extracted_proteins)) | ||
|
||
proteins <- c(extracted_proteins) | ||
|
||
|
||
TP <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow() | ||
|
||
|
||
FP <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow() | ||
|
||
|
||
TN <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow() | ||
|
||
|
||
FN <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow() | ||
|
||
cat("True Positives (Yeast and EColi): ", TP, "\n") | ||
cat("False Positives (Human Samples)", FP, "\n") | ||
cat("True Negatives", TN, "\n") | ||
cat("False Negatives", FN, "\n") | ||
|
||
ecoli_comparisonResult %>% | ||
filter(is.finite(log2FC)) %>% | ||
ggplot() + geom_boxplot(aes(y=log2FC)) + | ||
geom_hline(aes(yintercept=-2), linetype="dashed", color="red", linewidth=2) + | ||
theme_bw() + | ||
labs(title = "Boxplot of log2FC for E. coli", | ||
y = "log2FC") | ||
|
||
human_comparisonResult %>% | ||
filter(is.finite(log2FC)) %>% | ||
ggplot() + geom_boxplot(aes(y=log2FC)) + | ||
geom_hline(aes(yintercept=-2), linetype="dashed", color="blue", linewidth=2) + | ||
theme_bw() + | ||
labs(title = "Boxplot of log2FC for Human", | ||
y = "log2FC") | ||
|
||
|
||
yeast_comparisonResult %>% | ||
filter(is.finite(log2FC)) %>% | ||
ggplot() + geom_boxplot(aes(y=log2FC)) + | ||
geom_hline(aes(yintercept=-2), linetype="dashed", color="green", linewidth=2) + | ||
theme_bw() + | ||
labs(title = "Boxplot of log2FC for Yeast", | ||
y = "log2FC") | ||
|
||
FPR <- FP / (FP + TN) | ||
|
||
# Accuracy | ||
accuracy <- (TP + TN) / (TP + TN + FP + FN) | ||
|
||
# Recall | ||
recall <- TP / (TP + FN) | ||
|
||
# False Discover Rate (FDR) | ||
|
||
fdr <- FP/ (FP + TP) | ||
|
||
results <- data.frame( | ||
Label = label, | ||
TP = TP, | ||
FP = FP, | ||
TN = TN, | ||
FN = FN, | ||
FPR = FPR, | ||
Accuracy = accuracy, | ||
Recall = recall, | ||
FDR = fdr | ||
) | ||
|
||
return(results) | ||
|
||
} | ||
|
||
start_time <- Sys.time() | ||
|
||
fragpipe_raw = data.table::fread("/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats.csv") | ||
|
||
head(fragpipe_raw) | ||
|
||
fragpipe_raw$Condition = unlist(lapply(fragpipe_raw$Run, function(x){ | ||
paste(str_split(x, "\\_")[[1]][4:5], collapse="_") | ||
})) | ||
|
||
fragpipe_raw$BioReplicate = unlist(lapply(fragpipe_raw$Run, function(x){ | ||
paste(str_split(x, "\\_")[[1]][4:7], collapse="_") | ||
})) | ||
|
||
msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE) | ||
|
||
|
||
data_process_tasks <- list( | ||
list( | ||
label = "Data process with Normalized Data", | ||
result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20) | ||
), | ||
list( | ||
label = "Data process with Normalization and MBImpute False", | ||
result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) | ||
), | ||
list( | ||
label = "Data process without Normalization", | ||
result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) | ||
), | ||
list( | ||
label = "Data process without Normalization with MBImpute False", | ||
result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) | ||
) | ||
) | ||
|
||
start_time <- Sys.time() | ||
|
||
num_cores <- detectCores() - 1 | ||
|
||
summarized_results <- mclapply(data_process_tasks, function(task) { | ||
list(label = task$label, summarized = task$result()) | ||
}, mc.cores = num_cores) | ||
|
||
results_list <- mclapply(summarized_results, function(res) { | ||
calculateResult(res$summarized, res$label) | ||
}, mc.cores = num_cores) | ||
|
||
final_results <- do.call(rbind, results_list) | ||
end_time <- Sys.time() | ||
total_time <- end_time - start_time | ||
print(final_results) | ||
print(paste("Total Execution Time:", total_time)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name=msstats_benchmark_job | ||
#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/ | ||
#SBATCH --output=job_output.txt | ||
#SBATCH --error=job_error.txt | ||
#SBATCH --time=01:00:00 # Set the maximum run time | ||
#SBATCH --ntasks=1 # Number of tasks (one process) | ||
#SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task | ||
#SBATCH --mem=256G # Request 256GB of memory | ||
#SBATCH --partition=short # Use the 'short' partition (or change as needed) | ||
|
||
module load R-geospatial | ||
|
||
|
||
module load gcc/11.1.0 | ||
module load cmake/3.23.2 | ||
|
||
export LC_ALL=C | ||
export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial | ||
|
||
|
||
mkdir -p $R_LIBS_USER | ||
|
||
module load R | ||
Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \ | ||
remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \ | ||
remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ | ||
install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')" | ||
|
||
Rscript benchmark.R |