diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 00000000..1d00ad10 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,77 @@ +name: Run Simple R Script on HPC via Slurm + +on: + push: + branches: + - feature/ci-cd-pipeline + # - devel + +jobs: + Benchmarking-pipeline: + runs-on: ubuntu-latest + env: + FDR_THRESHOLD: 0.21 + + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + + - name: Set Up SSH Access + run: | + mkdir -p ~/.ssh + touch ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa + ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1 + + - name: Transfer Files to HPC + run: | + scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1 + + - name: Submit Slurm Job and Capture Job ID + id: submit_job + run: | + ssh raina.ans@login-00.discovery.neu.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt + slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) + echo "Slurm Job ID is $slurm_job_id" + echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV + + - name: Monitor Slurm Job + run: | + ssh raina.ans@login-00.discovery.neu.edu " + while squeue -j ${{ env.slurm_job_id }} | grep -q ${{ env.slurm_job_id }}; do + echo 'Job Id : ${{ env.slurm_job_id }} is still running...' + sleep 10 + done + echo 'Job has completed.' + " + + - name: Fetch Output + run: | + scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt + scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt + + - name: Extract and Check FDR Values + run: | + grep -A 4 "FPR Accuracy Recall FDR" job_output.txt > results_section.txt + + if awk -v threshold="$FDR_THRESHOLD" ' + NR > 1 { + if ($5 > threshold) { + print "Error: FDR too high in row " NR " with FDR=" $5 + exit 1 + } + } + ' results_section.txt; then + echo "All FDR values are within acceptable range." + else + exit 1 + fi + + - name: Upload Output as Artifact + uses: actions/upload-artifact@v4 + with: + name: benchmark-output + path: | + job_output.txt + job_error.txt diff --git a/benchmark/benchmark.R b/benchmark/benchmark.R new file mode 100644 index 00000000..77c5605a --- /dev/null +++ b/benchmark/benchmark.R @@ -0,0 +1,180 @@ +library(MSstatsConvert) +library(MSstats) +library(ggplot2) +library(dplyr) +library(stringr) +library(parallel) + + +calculateResult <- function(summarized, label){ + + model = groupComparison("pairwise", summarized) + comparisonResult <- model$ComparisonResult + + human_comparisonResult <- comparisonResult %>% filter(grepl("_HUMAN$", Protein)) + + ecoli_comparisonResult <- comparisonResult %>% filter(grepl("_ECOLI$", Protein)) + + yeast_comparisonResult <- comparisonResult %>% filter(grepl("_YEAST$", Protein)) + + + human_median <- median(human_comparisonResult$log2FC, na.rm = TRUE) + ecoli_median <- median(ecoli_comparisonResult$log2FC, na.rm = TRUE) + yeast_median <- median(yeast_comparisonResult$log2FC, na.rm = TRUE) + + cat("Expected Log Change Human:", human_median, "\n") + cat("Expected Log Change Ecoli:", ecoli_median, "\n") + cat("Expected Log Change Yeast:", yeast_median, "\n") + + boxplot(ecoli_comparisonResult$log2FC, + main = "Boxplot of log2FC for E. coli", + ylab = "log2FC", + col = "lightgreen") + combined_data <- list( + Human = human_comparisonResult$log2FC, + Ecoli = ecoli_comparisonResult$log2FC, + Yeast = yeast_comparisonResult$log2FC + ) + + + unique_ecoli_proteins <- unique(ecoli_comparisonResult$Protein) + unique_yeast_proteins <- unique(yeast_comparisonResult$Protein) + + all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins)) + + extracted_proteins <- sapply(all_proteins, function(x) { + split_string <- strsplit(x, "\\|")[[1]] + if (length(split_string) >= 2) { + return(split_string[2]) + } else { + return(NA) + } + }) + + extracted_proteins <- unname(unlist(extracted_proteins)) + + proteins <- c(extracted_proteins) + + + TP <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow() + + + FP <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow() + + + TN <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow() + + + FN <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow() + + cat("True Positives (Yeast and EColi): ", TP, "\n") + cat("False Positives (Human Samples)", FP, "\n") + cat("True Negatives", TN, "\n") + cat("False Negatives", FN, "\n") + + ecoli_comparisonResult %>% + filter(is.finite(log2FC)) %>% + ggplot() + geom_boxplot(aes(y=log2FC)) + + geom_hline(aes(yintercept=-2), linetype="dashed", color="red", linewidth=2) + + theme_bw() + + labs(title = "Boxplot of log2FC for E. coli", + y = "log2FC") + + human_comparisonResult %>% + filter(is.finite(log2FC)) %>% + ggplot() + geom_boxplot(aes(y=log2FC)) + + geom_hline(aes(yintercept=-2), linetype="dashed", color="blue", linewidth=2) + + theme_bw() + + labs(title = "Boxplot of log2FC for Human", + y = "log2FC") + + + yeast_comparisonResult %>% + filter(is.finite(log2FC)) %>% + ggplot() + geom_boxplot(aes(y=log2FC)) + + geom_hline(aes(yintercept=-2), linetype="dashed", color="green", linewidth=2) + + theme_bw() + + labs(title = "Boxplot of log2FC for Yeast", + y = "log2FC") + + FPR <- FP / (FP + TN) + + # Accuracy + accuracy <- (TP + TN) / (TP + TN + FP + FN) + + # Recall + recall <- TP / (TP + FN) + + # False Discover Rate (FDR) + + fdr <- FP/ (FP + TP) + + results <- data.frame( + Label = label, + TP = TP, + FP = FP, + TN = TN, + FN = FN, + FPR = FPR, + Accuracy = accuracy, + Recall = recall, + FDR = fdr + ) + + return(results) + +} + +start_time <- Sys.time() + +fragpipe_raw = data.table::fread("/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats.csv") + +head(fragpipe_raw) + +fragpipe_raw$Condition = unlist(lapply(fragpipe_raw$Run, function(x){ + paste(str_split(x, "\\_")[[1]][4:5], collapse="_") +})) + +fragpipe_raw$BioReplicate = unlist(lapply(fragpipe_raw$Run, function(x){ + paste(str_split(x, "\\_")[[1]][4:7], collapse="_") +})) + +msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE) + + +data_process_tasks <- list( + list( + label = "Data process with Normalized Data", + result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20) + ), + list( + label = "Data process with Normalization and MBImpute False", + result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization", + result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) + ), + list( + label = "Data process without Normalization with MBImpute False", + result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) + ) +) + +start_time <- Sys.time() + +num_cores <- detectCores() - 1 + +summarized_results <- mclapply(data_process_tasks, function(task) { + list(label = task$label, summarized = task$result()) +}, mc.cores = num_cores) + +results_list <- mclapply(summarized_results, function(res) { + calculateResult(res$summarized, res$label) +}, mc.cores = num_cores) + +final_results <- do.call(rbind, results_list) +end_time <- Sys.time() +total_time <- end_time - start_time +print(final_results) +print(paste("Total Execution Time:", total_time)) \ No newline at end of file diff --git a/benchmark/config.slurm b/benchmark/config.slurm new file mode 100644 index 00000000..e6864ead --- /dev/null +++ b/benchmark/config.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=msstats_benchmark_job +#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/ +#SBATCH --output=job_output.txt +#SBATCH --error=job_error.txt +#SBATCH --time=01:00:00 # Set the maximum run time +#SBATCH --ntasks=1 # Number of tasks (one process) +#SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task +#SBATCH --mem=256G # Request 256GB of memory +#SBATCH --partition=short # Use the 'short' partition (or change as needed) + +module load R-geospatial + + +module load gcc/11.1.0 +module load cmake/3.23.2 + +export LC_ALL=C +export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial + + +mkdir -p $R_LIBS_USER + +module load R +Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \ +remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \ +remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ +install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')" + +Rscript benchmark.R \ No newline at end of file