Skip to content

Commit

Permalink
Merge pull request #35 from NBISweden/master
Browse files Browse the repository at this point in the history
All changes for 2025 version of course
  • Loading branch information
asabjorklund authored Feb 28, 2025
2 parents fa27841 + 1123596 commit 7bdfbe0
Show file tree
Hide file tree
Showing 734 changed files with 42,197 additions and 9,599 deletions.
133 changes: 116 additions & 17 deletions compiled/labs/bioc/bioc_01_qc.qmd

Large diffs are not rendered by default.

120 changes: 107 additions & 13 deletions compiled/labs/bioc/bioc_02_dimred.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ First, let's load all necessary libraries and the QC-filtered dataset
from the previous step.

``` {r}
#| label: libraries
suppressPackageStartupMessages({
library(scater)
library(scran)
Expand All @@ -30,14 +31,19 @@ suppressPackageStartupMessages({
```

``` {r}
#| label: fetch-data
# download pre-computed data if missing or long compute
fetch_data <- TRUE
# url for source and intermediate data
path_data <- "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
path_data <- "https://nextcloud.dc.scilifelab.se/public.php/webdav"
curl_upass <- "-u zbC5fr2LbEZ9rSE:scRNAseq2025"
path_file <- "data/covid/results/bioc_covid_qc.rds"
if (!dir.exists(dirname(path_file))) dir.create(dirname(path_file), recursive = TRUE)
if (fetch_data && !file.exists(path_file)) download.file(url = file.path(path_data, "covid/results/bioc_covid_qc.rds"), destfile = path_file)
if (fetch_data && !file.exists(path_file)) download.file(url = file.path(path_data, "covid/results_bioc/bioc_covid_qc.rds"), destfile = path_file, method = "curl", extra = curl_upass)
sce <- readRDS(path_file)
```

Expand All @@ -48,23 +54,35 @@ dataset to distinguish cell types. For this purpose, we need to find
genes that are highly variable across cells, which in turn will also
provide a good separation of the cell clusters.

With `modelGeneVar` we can specify a blocking parameter, so the trend
fitting and variance decomposition is done separately for each batch.

``` {r}
#| fig-height: 4
#| fig-width: 8
#| label: hvg
sce <- computeSumFactors(sce, sizes = c(20, 40, 60, 80))
sce <- logNormCounts(sce)
var.out <- modelGeneVar(sce, method = "loess")
var.out <- modelGeneVar(sce, block = sce$sample)
hvgs <- getTopHVGs(var.out, n = 2000)
```

We can plot the *total* variance and the *biological* variance vs mean
expressioni for one of the samples.

``` {r}
#| label: plot-hvg
#| fig-height: 4
#| fig-width: 8
par(mfrow = c(1, 2))
# plot mean over TOTAL variance
# Visualizing the fit:
fit.var <- metadata(var.out)
fit.var <- metadata(var.out$per.block[[1]])
{
plot(fit.var$mean, fit.var$var,
xlab = "Mean of log-expression",
ylab = "Variance of log-expression"
ylab = "Variance of log-expression",
main = "Total variance"
)
curve(fit.var$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
Expand All @@ -77,13 +95,46 @@ fit.var <- metadata(var.out)
}
{
# plot mean over BIOLOGICAL variance
plot(var.out$mean, var.out$bio, pch = 16, cex = 0.4, xlab = "Mean log-expression", ylab = "Variance of log-expression")
lines(c(min(var.out$mean), max(var.out$mean)), c(0, 0), col = "dodgerblue", lwd = 2)
points(var.out$mean[cutoff], var.out$bio[cutoff], col = "red", pch = 16, cex = .6)
# plot mean over BIOLOGICAL variance for same sample
plot(var.out$per.block[[1]]$mean, var.out$per.block[[1]]$bio, pch = 16, cex = 0.4,
xlab = "Mean log-expression",
ylab = "Variance of log-expression",
main = "Biological variance")
lines(c(min(var.out$per.block[[1]]$mean), max(var.out$per.block[[1]]$mean)), c(0, 0), col = "dodgerblue", lwd = 2)
points(var.out$per.block[[1]]$mean[cutoff], var.out$per.block[[1]]$bio[cutoff], col = "red", pch = 16, cex = .6)
}
```

We can plot the same for all the samples, also showing the technical
variance.

``` {r}
#| label: plot-hvg-sample
#| fig-height: 4
#| fig-width: 8
cutoff <- rownames(var.out) %in% hvgs
par(mfrow = c(1, 3))
plot(var.out$mean, var.out$total, pch = 16, cex = 0.4,
xlab = "Mean log-expression",
ylab = "Variance of log-expression",
main = "Total variance")
points(var.out$mean[cutoff], var.out$total[cutoff], col = "red", pch = 16, cex = .6)
plot(var.out$mean, var.out$bio, pch = 16, cex = 0.4,
xlab = "Mean log-expression",
ylab = "Variance of log-expression",
main = "Biological variance")
points(var.out$mean[cutoff], var.out$bio[cutoff], col = "red", pch = 16, cex = .6)
plot(var.out$mean, var.out$tech, pch = 16, cex = 0.4,
xlab = "Mean log-expression",
ylab = "Variance of log-expression",
main = "Technical variance")
points(var.out$mean[cutoff], var.out$tech[cutoff], col = "red", pch = 16, cex = .6)
```

## Z-score transformation

Now that the genes have been selected, we now proceed with PCA. Since
Expand All @@ -108,6 +159,7 @@ function below. Here we will show you how to add the scaledData back to
the object.

``` {r}
#| label: scale
# sce@assays$data@listData$scaled.data <- apply(exprs(sce)[rownames(hvg.out),,drop=FALSE],2,function(x) scale(x,T,T))
# rownames(sce@assays$data@listData$scaled.data) <- rownames(hvg.out)
```
Expand All @@ -122,13 +174,15 @@ We use the `logcounts` and then set `scale_features` to TRUE in order to
scale each gene.

``` {r}
#| label: pca
# runPCA and specify the variable genes to use for dim reduction with subset_row
sce <- runPCA(sce, exprs_values = "logcounts", ncomponents = 50, subset_row = hvg.out, scale = TRUE)
sce <- runPCA(sce, exprs_values = "logcounts", ncomponents = 50, subset_row = hvgs, scale = TRUE)
```

We then plot the first principal components.

``` {r}
#| label: pca-plot
#| fig-height: 3.5
#| fig-width: 10
Expand All @@ -150,15 +204,27 @@ each PC. This can be important to look at to understand different biases
you may have in your data.

``` {r}
#| label: pca-explanatory
#| fig-height: 3.5
#| fig-width: 10
plotExplanatoryPCs(sce)
plotExplanatoryPCs(sce,nvars_to_plot = 15)
```

<div>

> **Discuss**
>
> Have a look at the plot from `plotExplanatoryPCs` and the gene
> loadings plots. Do you think the top components are biologically
> relevant or more driven by technical noise
</div>

We can also plot the amount of variance explained by each PC.

``` {r}
#| label: pca-elbow
#| fig-height: 4
#| fig-width: 4
Expand All @@ -176,6 +242,7 @@ about rare cell types (such as platelets and DCs in this dataset)
We will now run [BH-tSNE](https://arxiv.org/abs/1301.3342).

``` {r}
#| label: run-tsne
set.seed(42)
sce <- runTSNE(sce, dimred = "PCA", n_dimred = 30, perplexity = 30, name = "tSNE_on_PCA")
```
Expand All @@ -184,6 +251,7 @@ We plot the tSNE scatterplot colored by dataset. We can clearly see the
effect of batches present in the dataset.

``` {r}
#| label: plot-tsne
#| fig-height: 5
#| fig-width: 7
Expand All @@ -196,6 +264,8 @@ We can now run [UMAP](https://arxiv.org/abs/1802.03426) for cell
embeddings.

``` {r}
#| label: run-umap
sce <- runUMAP(sce, dimred = "PCA", n_dimred = 30, ncomponents = 2, name = "UMAP_on_PCA")
# see ?umap and ?runUMAP for more info
```
Expand All @@ -204,6 +274,7 @@ UMAP is plotted colored per dataset. Although less distinct as in the
tSNE, we still see quite an effect of the different batches in the data.

``` {r}
#| label: run-umap2
sce <- runUMAP(sce, dimred = "PCA", n_dimred = 30, ncomponents = 10, name = "UMAP10_on_PCA")
# see ?umap and ?runUMAP for more info
```
Expand All @@ -216,6 +287,7 @@ our dataset contains a batch effect that needs to be corrected before
proceeding to clustering and differential gene expression analysis.

``` {r}
#| label: plot-umap
#| fig-height: 3.5
#| fig-width: 10
Expand Down Expand Up @@ -261,6 +333,7 @@ unfeasible. Therefore we will use the scaled data of the highly variable
genes.

``` {r}
#| label: run-umap-sd
sce <- runUMAP(sce, exprs_values = "logcounts", name = "UMAP_on_ScaleData")
```

Expand All @@ -274,6 +347,7 @@ such, you can run either UMAP or tSNE using any other distance metric
desired. Euclidean and Correlation are usually the most commonly used.

``` {r}
#| label: run-umap-graph
# Build Graph
nn <- RANN::nn2(reducedDim(sce, "PCA"), k = 30)
names(nn) <- c("idx", "dist")
Expand All @@ -289,6 +363,7 @@ reducedDim(sce, "UMAP_on_Graph") <- uwot::umap(X = NULL, n_components = 2, nn_me
We can now plot the UMAP comparing both on PCA vs ScaledSata vs Graph.

``` {r}
#| label: plot-umap-all
#| fig-height: 3.5
#| fig-width: 10
Expand Down Expand Up @@ -320,6 +395,7 @@ embedding.
FCER1A, CST3 DCs

``` {r}
#| label: plot-markers
#| fig-height: 14
#| fig-width: 11
Expand All @@ -342,11 +418,28 @@ wrap_plots(plotlist, ncol = 3)
</div>

``` {r}
#| label: plot-qc
#| fig-height: 8
#| fig-width: 11
plotlist <- list()
for (i in c("detected", "total", "subsets_mt_percent","subsets_ribo_percent","subsets_hb_percent")) {
plotlist[[i]] <- plotReducedDim(sce, dimred = "UMAP_on_PCA", colour_by = i, by_exprs_values = "logcounts") +
scale_fill_gradientn(colours = colorRampPalette(c("grey90", "orange3", "firebrick", "firebrick", "red", "red"))(10)) +
ggtitle(label = i) + theme(plot.title = element_text(size = 20))
}
wrap_plots(plotlist, ncol = 3)
```

## Save data

We can finally save the object for use in future steps.

``` {r}
#| label: save
saveRDS(sce, "data/covid/results/bioc_covid_qc_dr.rds")
```

Expand All @@ -363,6 +456,7 @@ Click here
</summary>
```
``` {r}
#| label: session
sessionInfo()
```

Expand Down
Loading

0 comments on commit 7bdfbe0

Please sign in to comment.