From 11dab3b3e80b7d9cf202ba7385728809971b650e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=85sa=20Bj=C3=B6rklund?= <asa.bjorklund@scilifelab.se>
Date: Tue, 6 Feb 2024 15:10:11 +0100
Subject: [PATCH] minor updates to seurat labs

---
 labs/_metadata.yml                    |  6 ++++
 labs/seurat/seurat_03_integration.qmd | 12 +++----
 labs/seurat/seurat_05_dge.qmd         | 49 +++++++++++++++++++++++++++
 labs/seurat/seurat_06_celltyping.qmd  | 47 +++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/labs/_metadata.yml b/labs/_metadata.yml
index 3a66e8e4..6e8fb3b8 100644
--- a/labs/_metadata.yml
+++ b/labs/_metadata.yml
@@ -320,8 +320,14 @@ ct_scpred_3: |
   Now, let's predict celltypes on our data, where scPred will align the two datasets with Harmony and then perform classification.
 ct_scpred_4: "Now plot how many	cells of each celltypes	can be found in	each cluster."
 
+ct_azimuth: "Azimuth"
+ct_azimuth_1: "There are multiple online resources with large curated datasets with methods to integrate and do label transfer. One such resource is [Azimuth](https://azimuth.hubmapconsortium.org/) another one is [Disco](https://www.immunesinglecell.org/)."
+ct_azimuth_2: "Azimuth is also possible to install and run locally, but in this case we have used the online app and ran the predictions for you. So you just have to download the prediction file."
+
 ct_compare: "Compare results"
 ct_compare_1: "Now we will compare the output of the two methods using the convenient function in scPred `crossTab()` that prints the overlap between two metadata slots."
+ct_compare_2: "We can also plot all the different predictions side by side"
+
 
 ct_gsea: "GSEA with celltype markers"
 ct_gsea_1: |
diff --git a/labs/seurat/seurat_03_integration.qmd b/labs/seurat/seurat_03_integration.qmd
index f4d86b1c..4ae1f794 100644
--- a/labs/seurat/seurat_03_integration.qmd
+++ b/labs/seurat/seurat_03_integration.qmd
@@ -180,11 +180,10 @@ myfeatures <- c("CD3E", "CD4", "CD8A", "NKG7", "GNLY", "MS4A1", "CD14", "LYZ", "
 FeaturePlot(alldata.int, reduction = "umap", dims = 1:2, features = myfeatures, ncol = 4, order = T) + NoLegend() + NoAxes() + NoGrid()
 ```
 
-## Harmony
 
-An alternative method for integration is Harmony, for more details on the method, please see their paper  [Nat. Methods](https://www.nature.com/articles/s41592-019-0619-0). 
+## {{< meta dimred_harmony >}}
 
-This method runs integration on a dimensionality reduction, in most applications the PCA. So first, we will rerun scaling and PCA with the same set of genes that were used for the CCA integration.
+{{< meta dimred_harmony_1 >}}
 
 OBS! Make sure to revert back to the `RNA` assay.
 
@@ -219,11 +218,12 @@ alldata.int <- RunUMAP(alldata.int, dims = 1:50, reduction = "harmony", reductio
 DimPlot(alldata.int, reduction = "umap_harmony", group.by = "orig.ident") + NoAxes() + ggtitle("Harmony UMAP")
 ```
 
-## Scanorama
+## {{< meta dimred_scanorama >}}
 
-Another integration method is Scanorama (see [Nat. Biotech.](https://www.nature.com/articles/s41587-019-0113-3)). This method is implemented in python, but we can run it through the Reticulate package.
+{{< meta dimred_scanorama_1 >}}
+
+{{< meta dimred_scanorama_2 >}}
 
-We will use the split list of samples and transpose the matrix as scanorama requires rows as samples and columns as genes. We also create a list with the highly variables genes for each sample.
 
 ```{r}
 #| fig-height: 5
diff --git a/labs/seurat/seurat_05_dge.qmd b/labs/seurat/seurat_05_dge.qmd
index fb9657b1..cd93748d 100644
--- a/labs/seurat/seurat_05_dge.qmd
+++ b/labs/seurat/seurat_05_dge.qmd
@@ -142,6 +142,55 @@ VlnPlot(alldata, features = as.character(unique(top3$gene)), ncol = 5, group.by
 Take a screenshot of those results and re-run the same code above with another test: "wilcox" (Wilcoxon Rank Sum test), "bimod" (Likelihood-ratio test), "roc" (Identifies 'markers' of gene expression using ROC analysis),"t" (Student's t-test),"negbinom" (negative binomial generalized linear model),"poisson" (poisson generalized linear model), "LR" (logistic regression), "MAST" (hurdle model), "DESeq2" (negative binomial distribution).
 :::
 
+### DGE with equal amount of cells
+
+The number of cells per cluster differ quite a bit in this data
+
+```{r}
+table(alldata@active.ident)
+```
+
+Hence when we run `FindAllMarkers` one cluster vs rest, the largest cluster (cluster 0) will dominate the "rest" and influence the results the most. So it is often a good idea to subsample the clusters to an equal number of cells before running differential expression for one vs rest. So lets select 300 cells per cluster:
+
+```{r}
+sub <- subset(alldata, cells = WhichCells(alldata, downsample = 300))
+table(sub@active.ident)
+```
+Now rerun `FindAllMarkers` with this set and compare the results.
+
+```{r}
+markers_genes_sub <- FindAllMarkers(
+    sub,
+    log2FC.threshold = 0.2,
+    test.use = "wilcox",
+    min.pct = 0.1,
+    min.diff.pct = 0.2,
+    only.pos = TRUE,
+    max.cells.per.ident = 50,
+    assay = "RNA"
+)
+```
+The number of significant genes per cluster has changed, with more for some clusters and less for others. 
+
+```{r}
+table(markers_genes$cluster)
+table(markers_genes_sub$cluster)
+```
+
+
+
+```{r}
+#| fig-height: 8
+#| fig-width: 8
+
+markers_genes_sub %>%
+    group_by(cluster) %>%
+    slice_min(p_val_adj, n = 5, with_ties = FALSE) -> top5_sub
+
+DotPlot(alldata, features = rev(as.character(unique(top5_sub$gene))), group.by = sel.clust, assay = "RNA") + coord_flip()
+```
+
+
 ## {{< meta dge_cond >}}
 
 {{< meta dge_cond_1 >}}
diff --git a/labs/seurat/seurat_06_celltyping.qmd b/labs/seurat/seurat_06_celltyping.qmd
index 5d9efe81..aeb35275 100644
--- a/labs/seurat/seurat_06_celltyping.qmd
+++ b/labs/seurat/seurat_06_celltyping.qmd
@@ -178,6 +178,41 @@ ggplot(ctrl@meta.data, aes(x = CCA_snn_res.0.5, fill = scpred_prediction)) +
     theme_classic()
 ```
 
+
+
+## {{< meta ct_azimuth >}}
+
+{{< meta ct_azimuth_1 >}}
+
+{{< meta ct_azimuth_2 >}}
+
+This is how to save the count matrix:
+
+```{r}
+#| eval: false
+
+# No need to run now.
+C = ctrl@assays$RNA@counts
+saveRDS(C, file = "data/covid/results/ctrl13_count_matrix.rds")
+```  
+
+Instead load the results and visualize:
+
+```{r}
+path_data <- "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
+path_file <- "data/covid/results/azimuth_pred.tsv"
+if (!dir.exists(dirname(path_file))) dir.create(dirname(path_file), recursive = TRUE)
+if (fetch_data && !file.exists(path_file)) download.file(url = file.path(path_data, "covid/results/azimuth_pred.tsv"), destfile = path_file)
+
+azimuth_pred <- read.table(path_file, sep = "\t", header = T)
+
+# add predictions to the seurat object
+ctrl$azimuth = azimuth_pred$predicted.celltype.l2
+DimPlot(ctrl, group.by = "azimuth", label = T, repel = T) + NoAxes()
+
+```
+
+
 ## {{< meta ct_compare >}}
 
 {{< meta ct_compare_1 >}}
@@ -186,6 +221,17 @@ ggplot(ctrl@meta.data, aes(x = CCA_snn_res.0.5, fill = scpred_prediction)) +
 crossTab(ctrl, "predicted.id", "scpred_prediction")
 ```
 
+{{< meta ct_compare_2 >}}
+
+```{r}
+wrap_plots(
+    DimPlot(ctrl, label = T, group.by = "predicted.id") + NoAxes(),
+    DimPlot(ctrl, label = T, group.by = "scpred_prediction") + NoAxes(),
+    DimPlot(ctrl, label = T, group.by = "azimuth") + NoAxes(),
+    ncol = 2
+)
+```
+
 ## {{< meta ct_gsea >}}
 
 {{< meta ct_gsea_1 >}}
@@ -379,6 +425,7 @@ wrap_plots(
 )
 ```
 
+
 :::{.callout-note title="Discuss"}
 {{< meta ct_gsea_annot_2 >}}
 :::