Setup CI/CD Pipeline for MSStats HPC (#143)

* Setup CI/CD Pipeline for MSStats
Vitek-Lab · Nov 26, 2024 · b21a35e · b21a35e
1 parent f494c87
commit b21a35e
Show file tree

Hide file tree

Showing 3 changed files with 287 additions and 0 deletions.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,77 @@
+name: Run Simple R Script on HPC via Slurm
+
+on:
+  push:
+    branches:
+      - feature/ci-cd-pipeline
+      # - devel
+
+jobs:
+  Benchmarking-pipeline:
+    runs-on: ubuntu-latest
+    env:
+      FDR_THRESHOLD: 0.21
+
+    steps:
+    - name: Checkout Repository
+      uses: actions/checkout@v3
+
+    - name: Set Up SSH Access
+      run: |
+        mkdir -p ~/.ssh
+        touch ~/.ssh/id_rsa
+        chmod 600 ~/.ssh/id_rsa
+        echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
+        ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1
+
+    - name: Transfer Files to HPC
+      run: |
+        scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1
+
+    - name: Submit Slurm Job and Capture Job ID
+      id: submit_job
+      run: |
+        ssh raina.ans@login-00.discovery.neu.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt 
+        slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) 
+        echo "Slurm Job ID is $slurm_job_id"
+        echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV  
+
+    - name: Monitor Slurm Job
+      run: |
+        ssh raina.ans@login-00.discovery.neu.edu "
+          while squeue -j ${{ env.slurm_job_id }} | grep -q ${{ env.slurm_job_id }}; do
+            echo 'Job Id : ${{ env.slurm_job_id }} is still running...'
+            sleep 10
+          done
+          echo 'Job has completed.'
+        "
+
+    - name: Fetch Output
+      run: |
+        scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt
+        scp raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt
+
+    - name: Extract and Check FDR Values
+      run: |
+        grep -A 4 "FPR  Accuracy    Recall       FDR" job_output.txt > results_section.txt
+        
+        if awk -v threshold="$FDR_THRESHOLD" '
+          NR > 1 {
+            if ($5 > threshold) {
+              print "Error: FDR too high in row " NR " with FDR=" $5
+              exit 1
+            }
+          }
+        ' results_section.txt; then
+          echo "All FDR values are within acceptable range."
+        else
+          exit 1
+        fi
+
+    - name: Upload Output as Artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: benchmark-output
+        path: |
+          job_output.txt
+          job_error.txt
diff --git a/benchmark/benchmark.R b/benchmark/benchmark.R
@@ -0,0 +1,180 @@
+library(MSstatsConvert)
+library(MSstats)
+library(ggplot2)
+library(dplyr)
+library(stringr)
+library(parallel)
+
+
+calculateResult <- function(summarized, label){
+
+  model = groupComparison("pairwise", summarized)
+  comparisonResult <- model$ComparisonResult
+
+  human_comparisonResult <- comparisonResult %>% filter(grepl("_HUMAN$", Protein))
+
+  ecoli_comparisonResult <- comparisonResult %>% filter(grepl("_ECOLI$", Protein))
+
+  yeast_comparisonResult <- comparisonResult %>% filter(grepl("_YEAST$", Protein))
+
+
+  human_median <- median(human_comparisonResult$log2FC, na.rm = TRUE)
+  ecoli_median <- median(ecoli_comparisonResult$log2FC, na.rm = TRUE) 
+  yeast_median <- median(yeast_comparisonResult$log2FC, na.rm = TRUE)
+
+  cat("Expected Log Change Human:", human_median, "\n")
+  cat("Expected Log Change Ecoli:", ecoli_median, "\n")
+  cat("Expected Log Change Yeast:", yeast_median, "\n")
+
+  boxplot(ecoli_comparisonResult$log2FC,
+          main = "Boxplot of log2FC for E. coli",
+          ylab = "log2FC",
+          col = "lightgreen")
+  combined_data <- list(
+    Human = human_comparisonResult$log2FC,
+    Ecoli = ecoli_comparisonResult$log2FC,
+    Yeast = yeast_comparisonResult$log2FC
+  )
+
+
+  unique_ecoli_proteins <- unique(ecoli_comparisonResult$Protein)
+  unique_yeast_proteins <- unique(yeast_comparisonResult$Protein)
+
+  all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins)) 
+
+  extracted_proteins <- sapply(all_proteins, function(x) {
+    split_string <- strsplit(x, "\\|")[[1]]  
+    if (length(split_string) >= 2) {
+      return(split_string[2])  
+    } else {
+      return(NA)  
+    }
+  })
+
+  extracted_proteins <- unname(unlist(extracted_proteins))
+
+  proteins <- c(extracted_proteins)
+
+
+  TP <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow()
+
+
+  FP <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue < 0.05) %>% nrow()
+
+
+  TN <- comparisonResult %>% filter(!grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow()
+
+
+  FN <- comparisonResult %>% filter(grepl(paste(proteins, collapse = "|"), Protein) & adj.pvalue >= 0.05) %>% nrow()
+
+  cat("True Positives (Yeast and EColi): ", TP, "\n")
+  cat("False Positives (Human Samples)", FP, "\n")
+  cat("True Negatives", TN, "\n")
+  cat("False Negatives", FN, "\n")
+
+  ecoli_comparisonResult %>% 
+      filter(is.finite(log2FC)) %>% 
+      ggplot() + geom_boxplot(aes(y=log2FC)) +
+      geom_hline(aes(yintercept=-2), linetype="dashed", color="red", linewidth=2) +
+      theme_bw() + 
+      labs(title = "Boxplot of log2FC for E. coli",
+           y = "log2FC")
+
+  human_comparisonResult %>% 
+      filter(is.finite(log2FC)) %>% 
+      ggplot() + geom_boxplot(aes(y=log2FC)) +
+      geom_hline(aes(yintercept=-2), linetype="dashed", color="blue", linewidth=2) +
+      theme_bw() + 
+      labs(title = "Boxplot of log2FC for Human",
+           y = "log2FC")
+
+
+  yeast_comparisonResult %>% 
+      filter(is.finite(log2FC)) %>% 
+      ggplot() + geom_boxplot(aes(y=log2FC)) +
+      geom_hline(aes(yintercept=-2), linetype="dashed", color="green", linewidth=2) +
+      theme_bw() + 
+      labs(title = "Boxplot of log2FC for Yeast",
+           y = "log2FC")
+
+  FPR <- FP / (FP + TN)
+
+  # Accuracy
+  accuracy <- (TP + TN) / (TP + TN + FP + FN)
+
+  # Recall
+  recall <- TP / (TP + FN)
+
+  # False Discover Rate (FDR)
+
+  fdr <- FP/ (FP + TP)
+
+  results <- data.frame(
+    Label = label,
+    TP = TP,
+    FP = FP,
+    TN = TN,
+    FN = FN,
+    FPR = FPR,
+    Accuracy = accuracy,
+    Recall = recall,
+    FDR = fdr
+  )
+
+  return(results)
+
+}
+
+start_time <- Sys.time()
+
+fragpipe_raw = data.table::fread("/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats.csv")
+
+head(fragpipe_raw)
+
+fragpipe_raw$Condition = unlist(lapply(fragpipe_raw$Run, function(x){
+  paste(str_split(x, "\\_")[[1]][4:5], collapse="_")
+}))
+
+fragpipe_raw$BioReplicate = unlist(lapply(fragpipe_raw$Run, function(x){
+  paste(str_split(x, "\\_")[[1]][4:7], collapse="_")
+}))
+
+msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)
+
+
+data_process_tasks <- list(
+  list(
+    label = "Data process with Normalized Data",
+    result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20)
+  ),
+  list(
+    label = "Data process with Normalization and MBImpute False",
+    result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE)
+  ),
+  list(
+    label = "Data process without Normalization",
+    result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20)
+  ),
+  list(
+    label = "Data process without Normalization with MBImpute False",
+    result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE)
+  )
+)
+
+start_time <- Sys.time()
+
+num_cores <- detectCores() - 1 
+
+summarized_results <- mclapply(data_process_tasks, function(task) {
+  list(label = task$label, summarized = task$result())
+}, mc.cores = num_cores)
+
+results_list <- mclapply(summarized_results, function(res) {
+  calculateResult(res$summarized, res$label)
+}, mc.cores = num_cores)
+
+final_results <- do.call(rbind, results_list)
+end_time <- Sys.time()
+total_time <- end_time - start_time
+print(final_results)
+print(paste("Total Execution Time:", total_time))
diff --git a/benchmark/config.slurm b/benchmark/config.slurm
@@ -0,0 +1,30 @@
+#!/bin/bash
+#SBATCH --job-name=msstats_benchmark_job
+#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/
+#SBATCH --output=job_output.txt
+#SBATCH --error=job_error.txt
+#SBATCH --time=01:00:00         # Set the maximum run time
+#SBATCH --ntasks=1              # Number of tasks (one process)
+#SBATCH --cpus-per-task=8      # Use 8 CPU cores for the task
+#SBATCH --mem=256G              # Request 256GB of memory
+#SBATCH --partition=short       # Use the 'short' partition (or change as needed)
+
+module load R-geospatial
+
+
+module load gcc/11.1.0
+module load cmake/3.23.2
+
+export LC_ALL=C
+export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial
+
+
+mkdir -p $R_LIBS_USER
+
+module load R
+Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \
+remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \
+remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \
+install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')"
+
+Rscript benchmark.R