harvardinformatics
diff --git a/‎.githooks/pre-commit
Lines changed: 39 additions & 0 deletions b/‎.githooks/pre-commit
Lines changed: 39 additions & 0 deletions
diff --git a/‎.github/workflows/gh-pages.yml
Lines changed: 10 additions & 2 deletions b/‎.github/workflows/gh-pages.yml
Lines changed: 10 additions & 2 deletions
diff --git a/‎.github/workflows/lychee.toml
Lines changed: 27 additions & 0 deletions b/‎.github/workflows/lychee.toml
Lines changed: 27 additions & 0 deletions
diff --git a/‎docs/resources/Tutorials/add-outgroup-to-whole-genome-alignment-cactus.md
Lines changed: 436 additions & 0 deletions b/‎docs/resources/Tutorials/add-outgroup-to-whole-genome-alignment-cactus.md
Lines changed: 436 additions & 0 deletions
diff --git a/‎docs/resources/Tutorials/add-to-whole-genome-alignment-cactus.md
Lines changed: 14 additions & 14 deletions b/‎docs/resources/Tutorials/add-to-whole-genome-alignment-cactus.md
Lines changed: 14 additions & 14 deletions
diff --git a/‎docs/resources/Tutorials/pangenome-cactus-minigraph.md
Lines changed: 4 additions & 4 deletions b/‎docs/resources/Tutorials/pangenome-cactus-minigraph.md
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/resources/Tutorials/replace-genome-whole-genome-alignment-cactus.md
Lines changed: 7 additions & 7 deletions b/‎docs/resources/Tutorials/replace-genome-whole-genome-alignment-cactus.md
Lines changed: 7 additions & 7 deletions
@@ -0,0 +1,39 @@
+#!/bin/sh
+# .githooks/pre-commit: re-comment the development-only "ignore: ['*.ipynb']" line in
+# mkdocs.yml before a commit. Best-effort: always exits 0, so it never blocks a commit.
+echo "Running pre-commit hook via sh..."
+
+FILE="mkdocs.yml"
+
+# Nothing to do if the file does not exist
+if [ ! -f "$FILE" ]; then
+  echo "Warning: mkdocs.yml not found"
+  exit 0
+fi
+
+# Create the scratch file only after the existence check; the trap removes it on every exit path
+TEMP_FILE="$(mktemp)" || { echo "Warning: could not create temporary file"; exit 0; }
+trap 'rm -f "$TEMP_FILE"' EXIT
+
+# Matches any uncommented line saying "ignore: ['*.ipynb']" (POSIX classes, not GNU-only \s)
+MODIFIED=0
+while IFS= read -r LINE || [ -n "$LINE" ]; do   # also handle a last line without a newline
+  if printf '%s\n' "$LINE" | grep -q "^[[:space:]]*ignore: \['\*\.ipynb'\]"; then
+    # Split the leading indent from the rest so the "# " is inserted after the indent
+    INDENT=$(printf '%s' "$LINE" | sed 's/^\([[:space:]]*\).*$/\1/')
+    BODY=$(printf '%s' "$LINE" | sed 's/^[[:space:]]*//')
+    printf '%s# %s\n' "$INDENT" "$BODY" >> "$TEMP_FILE"
+    MODIFIED=1
+  else
+    printf '%s\n' "$LINE" >> "$TEMP_FILE"   # printf, not echo: some sh echo builtins expand backslashes
+  fi
+done < "$FILE"
+
+if [ "$MODIFIED" -eq 1 ]; then
+  echo "Detected uncommented development-only ignore line in mkdocs.yml"
+  echo "Re-commenting it:       ignore: ['*.ipynb'] → # ignore: ['*.ipynb']"
+  cat "$TEMP_FILE" > "$FILE"   # overwrite in place: keeps mode and symlink, unlike mv of a 0600 temp file
+  git add "$FILE"
+fi
+
+exit 0
@@ -31,11 +31,17 @@ jobs:
       actions: read
     steps:
       # Cache lychee external URL results for 30 days
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
       - name: Download site
         uses: actions/download-artifact@v4
         with:
           name: github-pages
-      - run: tar -xf artifact.tar && rm artifact.tar
+
+      - name: Extract artifact
+        run: tar -xf artifact.tar && rm artifact.tar      
+
       # https://github.com/lycheeverse/lychee-action#utilising-the-cache-feature
       - name: Restore lychee cache
         id: restore-cache
@@ -44,11 +50,13 @@ jobs:
           path: .lycheecache
           key: cache-lychee-${{ github.sha }}
           restore-keys: cache-lychee-
+
       - name: Run lychee
         uses: lycheeverse/lychee-action@v1.8.0
         with:
-          args: "--base . --cache --max-cache-age 30d --max-concurrency 1 --require-https --timeout 5 --exclude-path 'assets/home.html' --exclude 'academic.oup.com/bioinformatics/' --exclude 'useast.ensembl.org' --exclude 'doi.org' --exclude 'academic.oup.com/nar' --exclude 'gnu.org' --exclude 'anaconda.org' --exclude 'fonts.gstatic.com' --exclude 'www.microsoft.com/en-us/microsoft-365/onedrive/online-cloud-storage' --exclude-path 404.html -- './**/*.html' './**/*.css'"
+          args: "--base . --cache --config .github/workflows/lychee.toml -- '.site/**/*.html' '.site/**/*.css'"
           fail: true
+
       - name: Save lychee cache
         uses: actions/cache/save@v3
         if: always()
 
@@ -0,0 +1,27 @@
+# lychee.toml
+
+# Cache link-check results between runs
+cache = true
+max_cache_age = "30d"
+max_concurrency = 1
+require_https = true
+timeout = 5
+
+# Exclude URLs from checking (each entry is a regular expression, not an exact match)
+exclude = [
+    "https://scholar.google.com",
+    "https://academic.oup.com/bioinformatics/",
+    "https://useast.ensembl.org",
+    "https://doi.org",
+    "https://academic.oup.com/nar",
+    "https://www.gnu.org",
+    "https://anaconda.org",
+    "https://fonts.gstatic.com",
+    "https://www.microsoft.com/en-us/microsoft-365/onedrive/online-cloud-storage",
+]
+
+# Exclude files or paths from checking
+exclude_path = [
+    "assets/home.html",
+    "404.html"
+]
@@ -96,12 +96,12 @@ With that, you should be ready to set-up your data for the pipeline!
 
 ## Inputs you need to prepare
 
-To run this pipeline, you will need:
+To run this pipeline, you will need (corresponding Snakemake config option given in parentheses):
 
-1. A [**HAL file**](https://github.com/ComparativeGenomicsToolkit/Hal) with a whole genome alignment generated by Cactus.
-2. The location in the tree to add your alignment.
-3. The [**softmasked**](#4-how-can-i-tell-if-my-genome-fasta-files-are-softmasked) genome [FASTA](https://en.wikipedia.org/wiki/FASTA_format) file for the genome you want to add to the alignment.
-4. A reference genome to project the alignment to MAF format.
+1. A [**HAL file**](https://github.com/ComparativeGenomicsToolkit/Hal) with a whole genome alignment generated by Cactus (`input_hal`).
+2. The location in the tree to add your alignment (see below).
+3. The [**softmasked**](#4-how-can-i-tell-if-my-genome-fasta-files-are-softmasked) genome [FASTA](https://en.wikipedia.org/wiki/FASTA_format) file for the genome you want to add to the alignment (`new_genome_fasta`).
+4. A reference genome to project the alignment to MAF format (`maf_reference`).
 
 !!! warning "[The FASTA file must be softmasked!](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/progressive.md#interface)"
 
@@ -129,19 +129,19 @@ which would result in:
 
 Now that we have the tree, we need to figure out where to put our new genome. We will need to come up with the following information:
 
-1. A **tip label** or name for our new genome.
-2. The **branch length** of the new branch connecting the new genome to an existing branch.
-3. A **label or name** for the new node in our tree, connecting the new branch to an existing branch.
-4. The **branch** on which to add the new node, defined by a parent and a child node.
-5. The branch on which we add that node will have its length split into two separate branches. We must provide the **top-most** branch length of these two new branches (*i.e.* the one defined by our new node as the child).
+1. A **tip label** or name for our new genome (`new_genome_name`).
+2. The **branch length** of the new branch connecting the new genome to an existing branch (`new_branch_length`).
+3. A **label or name** for the new node in our tree, connecting the new branch to an existing branch (`new_anc_node`).
+4. The **branch** on which to add the new node, defined by a parent (`parent_node`) and a child (`child_node`) node.
+5. The branch on which we add that node will have its length split into two separate branches. We must provide the **top-most** branch length of these two new branches (*i.e.* the one defined by our new node as the child) (`top_branch_length`).
 
 We borrow and slightly modify an [image from the cactus documentation](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/updating-alignments.md#adding-a-new-genome) to visualize these pieces of information on an example tree:
 
 <center>
     <img src="../../img/cactus-adding-to-branch2-ai.png" alt="Two panels, the first showing a phylogenetic tree with 3 tips and internal nodes labeled, the second showing a 4th tip being added to the tree.">
 </center> 
 
-In this context, we are adding the genome with the name "6" to our HAL. We are adding it such that it branches off from the branch defined by node 4 as the child and node 5 as the parent. To do so, we create a new node, which we come up with a name for (let's say RC for red circle), and a new branch 6-RC. This new RC node splits the 4-5 branch into two new branches: 4-RC and RC-5. For the pipeline you will need to provide the branch length of the **new** 6-RC branch and the **new** of RC-5 (**top-most**) branch.
+In this context, we are adding the genome with the name **"6"** to our HAL. We are adding it such that it branches off from the branch defined by node 4 as the child and node 5 as the parent. To do so, we create a new node, which we come up with a name for (let's say **RC** for red circle), and a new branch 6-RC. This new RC node splits the 4-5 branch into two new branches: 4-RC and RC-5. For the pipeline you will need to provide the branch lengths of the **new** 6-RC branch and of the **new** RC-5 (**top-most**) branch.
 
 If you're very good at parsing Newick tree strings by eye, you may be able to get this information just by looking at the output of `halStats --tree`. However in most cases, you'll want to look at an image of the tree. Consider using some sort of tree viewing software like [SeaView](https://doua.prabi.fr/software/seaview) or [the ape library in R](https://cran.r-project.org/web/packages/ape/index.html). EMBL also has an [online, interactive tree viewer](https://itol.embl.de/) where you can just paste the tree string to see an image of it.
 
@@ -153,7 +153,7 @@ Once you have the 5 pieces of information from the tree listed above, you're rea
 
 ### Reference sample
 
-In order to run the last step of the workflow that converts the HAL format to a readable MAF format (See [pipeline outputs](#pipeline-outputs) for more info), you will need to select one assembly as a reference assembly. The reference assembly's coordinate system will be used for projection to MAF format. You should indicate the reference assembly in the Snakemake config file (outlined below). For instance, if I wanted my reference sample in the above tree to be the genome labeled **1** in the tree, I would put the string `1` in the `maf_reference:` line of the Snakemake config file.
+In order to run the last step of the workflow that converts the HAL format to a readable MAF format (See [pipeline outputs](#pipeline-outputs) for more info), you will need to select one assembly as a reference assembly. The reference assembly's coordinate system will be used for projection to MAF format. You should indicate the reference assembly in the Snakemake config file (outlined below). For instance, if I wanted my reference sample in the above tree to be the genome labeled **1** in the tree, I would put the string `1` in the `maf_reference` line of the Snakemake config file.
 
 ### Preparing the Snakemake config file
 
@@ -171,7 +171,7 @@ In order to run the last step of the workflow that converts the HAL format to a
 
     The config for the Cactus test data can be found at [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/tests/evolverMammals/evolverMammals-update-cfg.yaml) or at `tests/evolverMammals/evolverMammals-update-cfg.yaml` in your downloaded cactus-snakemake repo. Be sure to use this as the template for your project since it has all the options needed! **Note: the partitions set in this config file are specific to the Harvard cluster. Be sure to update them if you are running this pipeline elsewhere.**
 
-    Additionally, a blank template file is located [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/update-config-template.yaml) or at `update-config-template.yaml` in your downloaded cactus-snakemake repo.
+    Additionally, a blank template file is located [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/config-templates/update-config-template.yaml) or at `config-templates/update-config-template.yaml` in your downloaded cactus-snakemake repo.
 
 Once you have all the information listed above, you can enter it into the Snakemake configuration file along with some other information to know where to look for files and write output. The config file contains 2 sections, one for specifying the input and output options, and one for specifying resources for the various rules (see [below](#specifying-resources-for-each-rule)). The first part should look something like this:
 
@@ -372,7 +372,7 @@ snakemake -j 10 -e slurm -s ../../cactus_update.smk --configfile evolverMammals-
 
 ## Pipeline outputs
 
-The pipeline will output a [.paf](https://github.com/lh3/miniasm/blob/master/PAF.md), a [.hal](https://github.com/ComparativeGenomicsToolkit/hal/blob/master/README.md), and a [.fa](https://en.wikipedia.org/wiki/FASTA_format) file for the new ancestral node as well as the parent node. If you specified `overwrite_original_hal: False` The final alignment file will be `<final_prefix>.hal`, where `<final_prefix>` is whatever you specified in the Snakemake config file. Otherwise, the original HAL will be modified in place.
+The pipeline will output a [.paf](https://github.com/lh3/miniasm/blob/master/PAF.md), a [.hal](https://github.com/ComparativeGenomicsToolkit/hal/blob/master/README.md), and a [.fa](https://en.wikipedia.org/wiki/FASTA_format) file for the new ancestral node as well as the parent node. If you specified `overwrite_original_hal: False`, the final alignment file will be `<final_prefix>.hal`, where `<final_prefix>` is whatever you specified in the Snakemake config file. Otherwise, the original HAL will be modified in place.
 
 The final alignment will also be presented in MAF format as `<final_prefix>.<maf_reference>.maf`, again where `<maf_reference>` is whatever you set in the Snakemake config. This file will include all sequences. Another MAF file, `<final_prefix>.<maf_reference>.nodupes.maf` will also be generated, which is the alignment in MAF format with no duplicate sequences. The de-duplicated MAF file is generated with `--dupeMode single`. See the [Cactus documentation regarding MAF export](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/progressive.md#maf-export) for more info.
 
 
@@ -89,10 +89,10 @@ With that, you should be ready to set-up your data for the pipeline!
 
 ## Inputs you need to prepare
 
-To run this pipeline, you will need:
+To run this pipeline, you will need (corresponding Snakemake config option given in parentheses):
 
-1. The assembled genome [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files for each sample.
-2. A reference sample.
+1. The assembled genome [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files for each sample (specified in `input_file`).
+2. A reference sample (`reference`).
 
 You will use these to create the input file for Cactus-minigraph.
 
@@ -128,7 +128,7 @@ Cactus-minigraph requires that you select one sample as a reference sample [for
 
     The config for the Cactus-minigraph test data can be found at [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/tests/yeast-minigraph/yeast-minigraph-cfg.yaml) or at `tests/yeast-minigraph/yeast-minigraph-cfg.yaml` in your downloaded cactus-snakemake repo. Be sure to use this as the template for your project since it has all the options needed! **Note: the partitions set in this config file are specific to the Harvard cluster. Be sure to update them if you are running this pipeline elsewhere.**
 
-    Additionally, a blank template file is located [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/minigraph-config-template.yaml) or at `minigraph-config-template.yaml` in your downloaded cactus-snakemake repo.
+    Additionally, a blank template file is located [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/config-templates/minigraph-config-template.yaml) or at `config-templates/minigraph-config-template.yaml` in your downloaded cactus-snakemake repo.
 
 Besides the sequence input, the pipeline needs some extra configuration to know where to look for files and write output. That is done in the Snakemake configuration file for a given run. It contains 2 sections, one for specifying the input and output options, and one for specifying resources for the various rules (see [below](#specifying-resources-for-each-rule)). The first part should look something like this:
 
 
@@ -96,12 +96,12 @@ With that, you should be ready to set-up your data for the pipeline!
 
 ## Inputs you need to prepare
 
-To run this pipeline, you will need:
+To run this pipeline, you will need (corresponding Snakemake config option given in parentheses):
 
-1. A [**HAL file**](https://github.com/ComparativeGenomicsToolkit/Hal) with a whole genome alignment generated by Cactus.
-2. The **name** of the genome you want to replace in the hAL file
-3. The [**softmasked**](#4-how-can-i-tell-if-my-genome-fasta-files-are-softmasked) genome [FASTA](https://en.wikipedia.org/wiki/FASTA_format) file for the genome you want to add to the alignment.
-4. A reference genome to project the alignment to MAF format.
+1. A [**HAL file**](https://github.com/ComparativeGenomicsToolkit/Hal) with a whole genome alignment generated by Cactus (`input_hal`).
+2. The **name** of the genome you want to replace in the HAL file (`replace`).
+3. The [**softmasked**](#4-how-can-i-tell-if-my-genome-fasta-files-are-softmasked) genome [FASTA](https://en.wikipedia.org/wiki/FASTA_format) file for the genome you want to add to the alignment (`new_genome_fasta`).
+4. A reference genome to project the alignment to MAF format (`maf_reference`).
 
 !!! warning "[The FASTA file must be softmasked!](https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/progressive.md#interface)"
 
@@ -127,7 +127,7 @@ which would result in:
 ((simHuman_chr6:0.144018,(simMouse_chr6:0.084509,simRat_chr6:0.091589)mr:0.271974)Anc1:0.020593,(simCow_chr6:0.18908,simDog_chr6:0.16303)Anc2:0.032898)Anc0;
 ```
 
-If we want to replace the *simHuman_chr6* genome in our HAL with a new version of the sequence, we would set this label as the value for `replace:` in our Snakemake config file below.
+If we want to replace the *simHuman_chr6* genome in our HAL with a new version of the sequence, we would set this label as the value for `replace` in our Snakemake config file below.
 
 You can also run `halStats --genomes example.hal` to print out the labels without the Newick tree formatting.
 
@@ -155,7 +155,7 @@ In order to run the last step of the workflow that converts the HAL format to a
 
     The config for the Cactus test data can be found at [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/tests/evolverMammals/evolverMammals-replace-cfg.yaml) or at `tests/evolverMammals/evolverMammals-replace-cfg.yaml` in your downloaded cactus-snakemake repo. Be sure to use this as the template for your project since it has all the options needed! **Note: the partitions set in this config file are specific to the Harvard cluster. Be sure to update them if you are running this pipeline elsewhere.**
 
-    Additionally, a blank template file is located [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/replace-config-template.yaml) or at `replace-config-template.yaml` in your downloaded cactus-snakemake repo.
+    Additionally, a blank template file is located [here](https://github.com/harvardinformatics/cactus-snakemake/blob/main/config-templates/replace-config-template.yaml) or at `config-templates/replace-config-template.yaml` in your downloaded cactus-snakemake repo.
 
 Once you have all the information listed above, you can enter it into the Snakemake configuration file along with some other information to know where to look for files and write output. The config file contains 2 sections, one for specifying the input and output options, and one for specifying resources for the various rules (see [below](#specifying-resources-for-each-rule)). The first part should look something like this: