Merge pull request #35 from NBISweden/master

All changes for 2025 version of course
NBISweden · Feb 28, 2025 · 7bdfbe0 · 7bdfbe0
2 parents fa27841 + 1123596
commit 7bdfbe0
Show file tree

Hide file tree

Showing 734 changed files with 42,197 additions and 9,599 deletions.
diff --git a/compiled/labs/bioc/bioc_01_qc.qmd b/compiled/labs/bioc/bioc_01_qc.qmd
diff --git a/compiled/labs/bioc/bioc_02_dimred.qmd b/compiled/labs/bioc/bioc_02_dimred.qmd
@@ -20,6 +20,7 @@ First, let's load all necessary libraries and the QC-filtered dataset
 from the previous step.
 
 ``` {r}
+#| label: libraries
 suppressPackageStartupMessages({
     library(scater)
     library(scran)
@@ -30,14 +31,19 @@ suppressPackageStartupMessages({
 ```
 
 ``` {r}
+#| label: fetch-data
+
 # download pre-computed data if missing or long compute
 fetch_data <- TRUE
 
 # url for source and intermediate data
-path_data <- "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
+path_data <- "https://nextcloud.dc.scilifelab.se/public.php/webdav"
+curl_upass <- "-u zbC5fr2LbEZ9rSE:scRNAseq2025"
+
 path_file <- "data/covid/results/bioc_covid_qc.rds"
 if (!dir.exists(dirname(path_file))) dir.create(dirname(path_file), recursive = TRUE)
-if (fetch_data && !file.exists(path_file)) download.file(url = file.path(path_data, "covid/results/bioc_covid_qc.rds"), destfile = path_file)
+if (fetch_data && !file.exists(path_file)) download.file(url = file.path(path_data, "covid/results_bioc/bioc_covid_qc.rds"), destfile = path_file, method = "curl", extra = curl_upass)
+
 sce <- readRDS(path_file)
 ```
 
@@ -48,23 +54,35 @@ dataset to distinguish cell types. For this purpose, we need to find
 genes that are highly variable across cells, which in turn will also
 provide a good separation of the cell clusters.
 
+With `modelGeneVar` we can specify a blocking parameter, so the trend
+fitting and variance decomposition is done separately for each batch.
+
 ``` {r}
-#| fig-height: 4
-#| fig-width: 8
+#| label: hvg
 
 sce <- computeSumFactors(sce, sizes = c(20, 40, 60, 80))
 sce <- logNormCounts(sce)
-var.out <- modelGeneVar(sce, method = "loess")
+var.out <- modelGeneVar(sce, block = sce$sample)
 hvgs <- getTopHVGs(var.out, n = 2000)
+```
+
+We can plot the *total* variance and the *biological* variance vs mean
+expressioni for one of the samples.
+
+``` {r}
+#| label: plot-hvg
+#| fig-height: 4
+#| fig-width: 8
 
 par(mfrow = c(1, 2))
 # plot mean over TOTAL variance
 # Visualizing the fit:
-fit.var <- metadata(var.out)
+fit.var <- metadata(var.out$per.block[[1]])
 {
     plot(fit.var$mean, fit.var$var,
         xlab = "Mean of log-expression",
-        ylab = "Variance of log-expression"
+        ylab = "Variance of log-expression",
+        main = "Total variance"
     )
     curve(fit.var$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
 
@@ -77,13 +95,46 @@ fit.var <- metadata(var.out)
 }
 
 {
-    # plot mean over BIOLOGICAL variance
-    plot(var.out$mean, var.out$bio, pch = 16, cex = 0.4, xlab = "Mean log-expression", ylab = "Variance of log-expression")
-    lines(c(min(var.out$mean), max(var.out$mean)), c(0, 0), col = "dodgerblue", lwd = 2)
-    points(var.out$mean[cutoff], var.out$bio[cutoff], col = "red", pch = 16, cex = .6)
+    # plot mean over BIOLOGICAL variance for same sample
+    plot(var.out$per.block[[1]]$mean, var.out$per.block[[1]]$bio, pch = 16, cex = 0.4, 
+         xlab = "Mean log-expression", 
+         ylab = "Variance of log-expression",
+         main = "Biological variance")
+    lines(c(min(var.out$per.block[[1]]$mean), max(var.out$per.block[[1]]$mean)), c(0, 0), col = "dodgerblue", lwd = 2)
+    points(var.out$per.block[[1]]$mean[cutoff], var.out$per.block[[1]]$bio[cutoff], col = "red", pch = 16, cex = .6)
 }
 ```
 
+We can plot the same for all the samples, also showing the technical
+variance.
+
+``` {r}
+#| label: plot-hvg-sample
+#| fig-height: 4
+#| fig-width: 8
+
+cutoff <- rownames(var.out) %in% hvgs
+
+par(mfrow = c(1, 3))
+    plot(var.out$mean, var.out$total, pch = 16, cex = 0.4, 
+         xlab = "Mean log-expression", 
+         ylab = "Variance of log-expression",
+         main = "Total variance")
+    points(var.out$mean[cutoff], var.out$total[cutoff], col = "red", pch = 16, cex = .6)
+
+    plot(var.out$mean, var.out$bio, pch = 16, cex = 0.4, 
+         xlab = "Mean log-expression", 
+         ylab = "Variance of log-expression",
+         main = "Biological variance")
+    points(var.out$mean[cutoff], var.out$bio[cutoff], col = "red", pch = 16, cex = .6)
+    
+    plot(var.out$mean, var.out$tech, pch = 16, cex = 0.4, 
+         xlab = "Mean log-expression", 
+         ylab = "Variance of log-expression",
+         main = "Technical variance")
+    points(var.out$mean[cutoff], var.out$tech[cutoff], col = "red", pch = 16, cex = .6)    
+```
+
 ## Z-score transformation
 
 Now that the genes have been selected, we now proceed with PCA. Since
@@ -108,6 +159,7 @@ function below. Here we will show you how to add the scaledData back to
 the object.
 
 ``` {r}
+#| label: scale
 # sce@assays$data@listData$scaled.data <- apply(exprs(sce)[rownames(hvg.out),,drop=FALSE],2,function(x) scale(x,T,T))
 # rownames(sce@assays$data@listData$scaled.data) <- rownames(hvg.out)
 ```
@@ -122,13 +174,15 @@ We use the `logcounts` and then set `scale_features` to TRUE in order to
 scale each gene.
 
 ``` {r}
+#| label: pca
 # runPCA and specify the variable genes to use for dim reduction with subset_row
-sce <- runPCA(sce, exprs_values = "logcounts", ncomponents = 50, subset_row = hvg.out, scale = TRUE)
+sce <- runPCA(sce, exprs_values = "logcounts", ncomponents = 50, subset_row = hvgs, scale = TRUE)
 ```
 
 We then plot the first principal components.
 
 ``` {r}
+#| label: pca-plot
 #| fig-height: 3.5
 #| fig-width: 10
 
@@ -150,15 +204,27 @@ each PC. This can be important to look at to understand different biases
 you may have in your data.
 
 ``` {r}
+#| label: pca-explanatory
 #| fig-height: 3.5
 #| fig-width: 10
 
-plotExplanatoryPCs(sce)
+plotExplanatoryPCs(sce,nvars_to_plot = 15)
 ```
 
+<div>
+
+> **Discuss**
+>
+> Have a look at the plot from `plotExplanatoryPCs` and the gene
+> loadings plots. Do you think the top components are biologically
+> relevant or more driven by technical noise
+
+</div>
+
 We can also plot the amount of variance explained by each PC.
 
 ``` {r}
+#| label: pca-elbow
 #| fig-height: 4
 #| fig-width: 4
 
@@ -176,6 +242,7 @@ about rare cell types (such as platelets and DCs in this dataset)
 We will now run [BH-tSNE](https://arxiv.org/abs/1301.3342).
 
 ``` {r}
+#| label: run-tsne
 set.seed(42)
 sce <- runTSNE(sce, dimred = "PCA", n_dimred = 30, perplexity = 30, name = "tSNE_on_PCA")
 ```
@@ -184,6 +251,7 @@ We plot the tSNE scatterplot colored by dataset. We can clearly see the
 effect of batches present in the dataset.
 
 ``` {r}
+#| label: plot-tsne
 #| fig-height: 5
 #| fig-width: 7
 
@@ -196,6 +264,8 @@ We can now run [UMAP](https://arxiv.org/abs/1802.03426) for cell
 embeddings.
 
 ``` {r}
+#| label: run-umap
+
 sce <- runUMAP(sce, dimred = "PCA", n_dimred = 30, ncomponents = 2, name = "UMAP_on_PCA")
 # see ?umap and ?runUMAP for more info
 ```
@@ -204,6 +274,7 @@ UMAP is plotted colored per dataset. Although less distinct as in the
 tSNE, we still see quite an effect of the different batches in the data.
 
 ``` {r}
+#| label: run-umap2
 sce <- runUMAP(sce, dimred = "PCA", n_dimred = 30, ncomponents = 10, name = "UMAP10_on_PCA")
 # see ?umap and ?runUMAP for more info
 ```
@@ -216,6 +287,7 @@ our dataset contains a batch effect that needs to be corrected before
 proceeding to clustering and differential gene expression analysis.
 
 ``` {r}
+#| label: plot-umap
 #| fig-height: 3.5
 #| fig-width: 10
 
@@ -261,6 +333,7 @@ unfeasible. Therefore we will use the scaled data of the highly variable
 genes.
 
 ``` {r}
+#| label: run-umap-sd
 sce <- runUMAP(sce, exprs_values = "logcounts", name = "UMAP_on_ScaleData")
 ```
 
@@ -274,6 +347,7 @@ such, you can run either UMAP or tSNE using any other distance metric
 desired. Euclidean and Correlation are usually the most commonly used.
 
 ``` {r}
+#| label: run-umap-graph
 # Build Graph
 nn <- RANN::nn2(reducedDim(sce, "PCA"), k = 30)
 names(nn) <- c("idx", "dist")
@@ -289,6 +363,7 @@ reducedDim(sce, "UMAP_on_Graph") <- uwot::umap(X = NULL, n_components = 2, nn_me
 We can now plot the UMAP comparing both on PCA vs ScaledSata vs Graph.
 
 ``` {r}
+#| label: plot-umap-all
 #| fig-height: 3.5
 #| fig-width: 10
 
@@ -320,6 +395,7 @@ embedding.
   FCER1A, CST3               DCs
 
 ``` {r}
+#| label: plot-markers
 #| fig-height: 14
 #| fig-width: 11
 
@@ -342,11 +418,28 @@ wrap_plots(plotlist, ncol = 3)
 
 </div>
 
+``` {r}
+#| label: plot-qc
+#| fig-height: 8
+#| fig-width: 11
+
+
+    
+plotlist <- list()
+for (i in c("detected", "total", "subsets_mt_percent","subsets_ribo_percent","subsets_hb_percent")) { 
+    plotlist[[i]] <- plotReducedDim(sce, dimred = "UMAP_on_PCA", colour_by = i, by_exprs_values = "logcounts") +
+        scale_fill_gradientn(colours = colorRampPalette(c("grey90", "orange3", "firebrick", "firebrick", "red", "red"))(10)) +
+        ggtitle(label = i) + theme(plot.title = element_text(size = 20))
+}
+wrap_plots(plotlist, ncol = 3)
+```
+
 ## Save data
 
 We can finally save the object for use in future steps.
 
 ``` {r}
+#| label: save
 saveRDS(sce, "data/covid/results/bioc_covid_qc_dr.rds")
 ```
 
@@ -363,6 +456,7 @@ Click here
 </summary>
 ```
 ``` {r}
+#| label: session
 sessionInfo()
 ```