Skip to content

Commit

Permalink
add CSD3 scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
tavareshugo committed Jun 28, 2024
1 parent cc4b5d6 commit ed39712
Show file tree
Hide file tree
Showing 45 changed files with 517 additions and 204 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ slides
notes

# specific files from course_materials that we do not version control
course_files/data
course_files/logs
course_files/results
course_files/**/data
course_files/**/logs
course_files/**/results
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
17 changes: 17 additions & 0 deletions course_files/csd3/dependency/notok/submit_notok.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# This is not submitted to SLURM!
# These are the sbatch commands we use to submit our jobs by hand.

# Submit the task for the first time and capture its job ID.
first_id=$(sbatch --parsable task_with_checkpoints.sh)

# Queue a second attempt that only starts if the first one fails
# (afternotok = run after the previous job did NOT finish OK).
second_id=$(sbatch --parsable --dependency "afternotok:${first_id}" task_with_checkpoints.sh)

# Queue a third attempt in case the second one also fails.
third_id=$(sbatch --parsable --dependency "afternotok:${second_id}" task_with_checkpoints.sh)

# We could keep chaining more attempts like this, but it is probably
# wise to stop after a few and check whether the job has finally
# completed or not.
45 changes: 45 additions & 0 deletions course_files/csd3/dependency/notok/task_with_checkpoints.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH -p icelake  # name of the partition to run job on
#SBATCH -o logs/task_with_checkpoints_%j.log
#SBATCH -c 1  # number of CPUs. Default: 1
#SBATCH --mem=1G  # RAM memory. Default: 1G
#SBATCH -t 00:01:00  # time for the job HH:MM:SS. Default: 1 min

# output file names
checkpoint="checkpoint.txt"
finalresult="long_task_result.txt"

# the code below this is a bit silly and you don't need to worry about its details
# we are simply incrementing a number by 1 every 15 seconds
# when that number reaches 10, we consider the job finished
# at each stage the current number is saved in the checkpoint file
# so if the job fails we resume it from that point
#
# NOTE: a full count needs 10 x 15s = 150s, which deliberately exceeds the
# 1-minute time limit requested above -- a single run gets killed before
# finishing, which is what makes this a useful "afternotok" resubmission demo.

#### incrementer-with-checkpoint ####

# Resume from the checkpoint file if it exists, otherwise start from zero.
if [ -f "$checkpoint" ]; then
  number=$(<"$checkpoint")
else
  number=0
fi

# Increment the counter every 15 seconds until it reaches 10,
# saving progress to the checkpoint file after each step.
# (arithmetic while-loop instead of the previous `for i in $(seq ...)`,
# which spawned a subshell and left the loop variable unused)
while [ "$number" -lt 10 ]; do
  sleep 15
  number=$(( number + 1 ))
  echo "$number" > "$checkpoint"
done

# result
echo "Congratulations, you have counted to 10." > "$finalresult"

# message to log file; quote the variable and use -f so a missing
# checkpoint (already removed by a racing resubmission) is not an error
echo "Job complete, removing checkpoint.txt file."
rm -f -- "$checkpoint"
10 changes: 10 additions & 0 deletions course_files/csd3/dependency/ok/submit_ok.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# This is not submitted to SLURM!
# These are the sbatch commands we use to submit our jobs by hand.

# Submit the first task of the pipeline and capture its job ID.
first_task_id=$(sbatch --parsable task1.sh)

# The second task only starts if the first one finished successfully
# (afterok = run after the previous job completed with exit code 0).
sbatch --dependency "afterok:${first_task_id}" task2.sh
12 changes: 12 additions & 0 deletions course_files/csd3/dependency/ok/task1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#SBATCH -p icelake  # name of the partition to run job on
#SBATCH -o logs/task1_%j.log
#SBATCH -c 1  # number of CPUs. Default: 1
#SBATCH --mem=1G  # RAM memory. Default: 1G
#SBATCH -t 00:02:00  # time for the job HH:MM:SS. Default: 1 min

# sleep for 60 seconds (to have time to see the job in the queue)
sleep 60

# create an example file for task2 to pick up
# FIX: was `touchh`, which is not a command -- the task always failed,
# so the afterok-dependent task2 could never run
touch output_task1.txt
13 changes: 13 additions & 0 deletions course_files/csd3/dependency/ok/task2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#SBATCH -p icelake  # partition (queue) to run on
#SBATCH -o logs/task2_%j.log
#SBATCH -c 1  # CPUs requested. Default: 1
#SBATCH --mem=1G  # RAM requested. Default: 1G
#SBATCH -t 00:01:00  # walltime HH:MM:SS. Default: 1 min

# Pause briefly so the job is visible in the queue for a while.
sleep 10

# Rename the file produced by task1. This only works if task1 completed
# successfully, which the afterok dependency guarantees.
mv output_task1.txt output_task2.txt
11 changes: 11 additions & 0 deletions course_files/csd3/dependency/singleton/submit_singleton.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# This is not submitted to SLURM!
# These are the sbatch commands we use to submit our jobs by hand.

# The first two tasks of the pipeline share a job name but have no
# dependencies, so they can run concurrently.
sbatch --job-name my_pipeline task1_singleton.sh
sbatch --job-name my_pipeline task2_singleton.sh

# The third task waits until every earlier job with the same name has
# finished (that is what the "singleton" dependency means).
sbatch --job-name my_pipeline --dependency singleton task3_singleton.sh
13 changes: 13 additions & 0 deletions course_files/csd3/dependency/singleton/task1_singleton.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#SBATCH -p icelake  # partition (queue) to run on
#SBATCH -J my_pipeline  # job name, shared by all jobs in this pipeline
#SBATCH -o logs/task1_singleton_%j.log
#SBATCH -c 1  # CPUs requested. Default: 1
#SBATCH --mem=1G  # RAM requested. Default: 1G
#SBATCH -t 00:01:00  # walltime HH:MM:SS. Default: 1 min

# Pause briefly so the job is visible in the queue for a while.
sleep 10

# Write this task's output file.
echo "Output from task1" > result_task1.txt
13 changes: 13 additions & 0 deletions course_files/csd3/dependency/singleton/task2_singleton.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#SBATCH -p icelake  # partition (queue) to run on
#SBATCH -J my_pipeline  # job name, shared by all jobs in this pipeline
#SBATCH -o logs/task2_singleton_%j.log
#SBATCH -c 1  # CPUs requested. Default: 1
#SBATCH --mem=1G  # RAM requested. Default: 1G
#SBATCH -t 00:01:00  # walltime HH:MM:SS. Default: 1 min

# Pause briefly so the job is visible in the queue for a while.
sleep 10

# Write this task's output file.
echo "Output from task2" > result_task2.txt
14 changes: 14 additions & 0 deletions course_files/csd3/dependency/singleton/task3_singleton.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
#SBATCH -p icelake  # name of the partition to run job on
#SBATCH -J my_pipeline  # name for the job
#SBATCH -o logs/task3_singleton_%j.log
#SBATCH -c 1  # number of CPUs. Default: 1
#SBATCH --mem=1G  # RAM memory. Default: 1G (was '--mem 1G'; '=' form matches the sibling scripts)
#SBATCH -t 00:01:00  # time for the job HH:MM:SS. Default: 1 min
#SBATCH --dependency singleton  # wait for all earlier jobs named 'my_pipeline'

# sleep for 10 seconds (to have time to see the job in the queue)
sleep 10

# concatenate the files from the previous two tasks into one;
# the singleton dependency guarantees both have finished by now
cat result_task1.txt result_task2.txt > result_task3.txt
65 changes: 65 additions & 0 deletions course_files/csd3/scripts/pi_estimator.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@


# User Arguments ----------------------------------------------------------

# Load argparse quietly so its startup messages do not pollute the job log.
suppressPackageStartupMessages(library("argparse"))

# create parser object
parser <- ArgumentParser()

# specify our desired options
# by default ArgumentParser will add a help option (-h/--help)
# number of CPU cores to spread the sampling over
parser$add_argument("--ncpus", type = "integer", default = 1,
help="number of CPUs used for calculation. Default: %(default)s")
# total points to draw, expressed in millions (e.g. 10 -> 10e6 samples)
parser$add_argument("--nsamples", type="integer", default = 10,
help="Number of points to sample for estimation in millions. Default: %(default)s",
metavar="number")

# parse arguments from the command line into a named list
args <- parser$parse_args()


# Functions ---------------------------------------------------------------

# split a number into N parts
# https://www.geeksforgeeks.org/split-the-number-into-n-parts-such-that-difference-between-the-smallest-and-the-largest-part-is-minimum/
split <- function(x, n) {
  # Divide the integer x into n parts whose sizes differ by at most one,
  # with the smaller parts listed first. Based on:
  # https://www.geeksforgeeks.org/split-the-number-into-n-parts-such-that-difference-between-the-smallest-and-the-largest-part-is-minimum/
  base_size <- floor(x / n)
  remainder <- x %% n
  if (remainder == 0) {
    # x divides evenly: every part gets the same size
    rep(base_size, n)
  } else {
    # the first (n - remainder) parts get the base size,
    # the remaining 'remainder' parts get one extra unit
    c(rep(base_size, n - remainder), rep(base_size + 1, remainder))
  }
}

# count points inside a circle
inside_circle <- function(total_count) {
  # Sample 'total_count' uniform random points in the unit square and
  # return how many of them fall inside the quarter unit circle.
  xs <- runif(total_count)
  ys <- runif(total_count)
  sum(sqrt(xs^2 + ys^2) <= 1)
}

# Estimate Pi ---------------------

# grab user options: convert "millions of samples" to an absolute count
n_samples <- ceiling(args$nsamples*1e6)
ncpus <- args$ncpus

# Split the samples into one chunk per CPU and count, in parallel, how
# many random points fall inside the quarter circle.
# FIX: mc.cores must be set explicitly -- without it mclapply ignores
# --ncpus and uses its default of getOption("mc.cores", 2) workers.
results <- parallel::mclapply(split(n_samples, ncpus), inside_circle, mc.cores = ncpus)
results <- unlist(results)

# Monte Carlo estimate: the fraction of points inside the quarter circle
# approximates pi/4, so multiply by 4.
counts <- sum(results)
my_pi <- 4*counts/n_samples

# print to standard output
cat(my_pi, "\n")

135 changes: 135 additions & 0 deletions course_files/csd3/scripts/turing_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Author: Benjamin F. Maier
# https://github.com/benmaier/reaction-diffusion
# Adapted by: Hugo Tavares

# import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import argparse


# ============ capture user input =============

parser = argparse.ArgumentParser(description='Reaction diffusion models.')
# Gray-Scott "feed" rate: how quickly chemical A is replenished
parser.add_argument('-f', '--feed', type=float, default=0.04,
help='The "feed rate" parameter of the model. Default: 0.04')
# Gray-Scott "kill" rate: how quickly chemical B is removed
parser.add_argument('-k', '--kill', type=float, default=0.06,
help='The "kill rate" parameter of the model. Default: 0.06')
# directory where the output PNG is written
parser.add_argument('-o', '--outdir', type=str, default=".",
help='Output directory. Default: .')

args = parser.parse_args()


# ============ define relevant functions =============

# an efficient function to compute a mean over neighboring cells
def apply_laplacian(mat):
    """Return the discrete Laplacian of ``mat`` with periodic boundaries.

    Implements the standard 5-point stencil: each cell contributes -4
    times its own value plus the values of its four direct neighbours.
    For more information see
    https://en.wikipedia.org/wiki/Discrete_Laplace_operator#Implementation_via_operator_discretization
    """
    # offsets (row, column) of the four direct lattice neighbours
    neighbour_shifts = [(-1, 0), (0, -1), (0, 1), (1, 0)]

    # the centre cell enters the stencil with weight -4
    laplacian = -4 * mat.copy()

    # add each neighbour by rolling the matrix, which wraps around the
    # edges and therefore gives periodic boundary conditions
    for shift in neighbour_shifts:
        laplacian += np.roll(mat, shift, (0, 1))

    return laplacian

# Define the update formula for chemicals A and B
def update(A, B, DA, DB, f, k, delta_t):
    """Advance the Gray-Scott model by one time step.

    A, B    -- concentration grids (modified in place AND returned)
    DA, DB  -- diffusion coefficients for A and B
    f, k    -- feed and kill rates
    delta_t -- integration time step
    """
    # reaction term: A is consumed and B produced at rate A * B^2
    reaction_rate = A * B**2

    # rate of change of A: diffusion, minus reaction, plus feed of fresh A
    delta_A = DA * apply_laplacian(A) - reaction_rate + f * (1 - A)
    # rate of change of B: diffusion, plus reaction, minus removal of B
    delta_B = DB * apply_laplacian(B) + reaction_rate - (k + f) * B

    # explicit Euler step, updating both grids in place
    A += delta_A * delta_t
    B += delta_B * delta_t

    return A, B

def get_initial_A_and_B(N, random_influence = 0.2):
    """Build the initial N-by-N concentration grids for the simulation.

    Returns (A, B): A starts near 1 and B near 0, each perturbed by
    uniform noise scaled by ``random_influence``, with a square
    disturbance of fixed concentrations placed at the centre.
    """
    # homogeneous background concentrations with uniform noise on top
    # (A's noise is drawn first, then B's, to keep the RNG stream order)
    A = (1 - random_influence) * np.ones((N, N)) + random_influence * np.random.random((N, N))
    B = np.zeros((N, N)) + random_influence * np.random.random((N, N))

    # centre coordinate and half-width of the initial square disturbance
    centre, half_width = N // 2, 50

    # overwrite the central square with fixed concentrations
    A[centre - half_width:centre + half_width, centre - half_width:centre + half_width] = 0.50
    B[centre - half_width:centre + half_width, centre - half_width:centre + half_width] = 0.25

    return A, B

def draw(A, B):
    """Return the matplotlib figure and image artists for animation.

    Displays the A and B concentration grids side by side as
    greyscale images, with axes hidden and panels titled 'A' and 'B'.
    """
    # FIX: the original called pl.subplots, but the module is imported
    # as `plt` at the top of the file -- `pl` raised a NameError.
    fig, ax = plt.subplots(1, 2, figsize=(5.65, 3))
    imA = ax[0].imshow(A, animated=True, vmin=0, cmap='Greys')
    imB = ax[1].imshow(B, animated=True, vmax=1, cmap='Greys')
    ax[0].axis('off')
    ax[1].axis('off')
    ax[0].set_title('A')
    ax[1].set_title('B')

    return fig, imA, imB


# =========== define model parameters ==========

# update in time (Euler integration step size)
delta_t = 1.0

# Diffusion coefficients
DA = 0.16
DB = 0.08

# define birth/death rates (taken from the command-line arguments)
f = args.feed
k = args.kill

# grid size (the simulation runs on an N x N lattice)
N = 200

# initialize the chemical concentrations
A, B = get_initial_A_and_B(N)

# run the simulation for a fixed number of update steps
N_simulation_steps = 10000
for step in range(N_simulation_steps):
    A, B = update(A, B, DA, DB, f, k, delta_t)

# make plot: save the final concentration of A as a PNG whose file name
# records the feed/kill parameters used for this run
plt.imshow(A)
plt.axis('off')
plt.savefig("{}/f{}_k{}.png".format(args.outdir, args.feed, args.kill))
24 changes: 24 additions & 0 deletions course_files/csd3/slurm/drosophila_genome_indexing.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
#SBATCH -A TRAINING-CPU
#SBATCH --reservation=training
#SBATCH -p icelake # name of the partition to run job on
#SBATCH -D /home/FIX-YOUR-USERNAME/rds/hpc-work/hpc_workshop/ # working directory
#SBATCH -o logs/drosophila_genome_indexing.log
#SBATCH -c 1 # number of CPUs. Default: 1
#SBATCH --mem=1G # RAM memory. Default: 1G
#SBATCH -t 00:10:00 # time for the job HH:MM:SS. Default: 1 min

# these lines are needed to source the mamba activate command
# include them if you want to activate environments in your script
eval "$(conda shell.bash hook)"
source $CONDA_PREFIX/etc/profile.d/mamba.sh

# activate conda environment
# NOTE (course exercise): replace FIXME below with the command that
# activates the environment containing bowtie2, e.g. `mamba activate <env_name>`
FIXME

# make an output directory for the index
mkdir -p results/drosophila/genome

# index the reference genome with bowtie2; the syntax is:
# bowtie2-build input.fa output_prefix
bowtie2-build data/genome/drosophila_genome.fa results/drosophila/genome/index
Loading

0 comments on commit ed39712

Please sign in to comment.