.Rhistory

install.packages("revealjs")
## RNA Biology -  The Central Dogma
rjs_twocol
library(knitr)
library(tidyverse)
include_graphics('src/dna_central_dogma_yourgenome.png')
include_graphics('src/rna_isoforms.gif')
library(knitr)
library(tidyverse)
opts_chunk$set(echo = F)
include_graphics('src/rnaseq_lib_prep.png')
# Example of the per-transcript quantification table (salmon `quant.sf`
# layout): one row per transcript, with a header row naming the columns.
# `txt` must be defined before it is parsed.
txt <- 'Name    Length  EffectiveLength TPM     NumReads
DNTX_00000017   12501   9261.044        2.532237        2529.632
DNTX_00000019   3218    1783.769        12.960541       2493.762
DNTX_00000020   39670   27993.255       1.370513        4138.375
DNTX_00000018   26868   17512.212       2.156619        4073.875
DNTX_00000034   12032   7527.305        0.713575        579.392
DNTX_00000033   8417    6031.772        0.725138        471.801
DNTX_00000035   3349    1579.033        3.459229        589.201'

# header = TRUE so the first line becomes the column names rather than a
# data row; kable() then renders those names without any col.names override.
quant_preview <- read.table(text = txt, header = TRUE)
quant_preview %>% kable()
library(tximport)
?require
?tximport
library(knitr)
library(tidyverse)
opts_chunk$set(echo = F)
opts_knit$set(root.dir = '/data/swamyvs/BIOF045-NGS-Data-Analysis-Day-3/')
library("tximport")
library("readr")
library("tximportData")
BiocManager::install("tximportData")
library("tximportData")
library("tximport")
library("readr")
library("tximportData")
library("DESeq2")
BiocManager::install('DESeq2')
library("tximport")
library("readr")
library("tximportData")
library("DESeq2")
library("tximport")
library("readr")
library("tximportData")
library("DESeq2")
# Build a DESeqDataSet from the example salmon output shipped with the
# tximportData package.

# Locate the packaged example data and read the sample sheet.
dir <- system.file("extdata", package = "tximportData")
samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)

# Assign an artificial two-level condition: first three samples "A",
# last three "B".
samples$condition <- factor(rep(c("A", "B"), each = 3))
rownames(samples) <- samples$run
samples[, c("pop", "center", "run", "condition")]

# One quant file per run; naming the vector makes tximport label the
# output columns with the run ids.
files <- file.path(dir, "salmon", samples$run, "quant.sf.gz")
names(files) <- samples$run

# Transcript -> gene mapping, then gene-level summarisation.
tx2gene <- read_csv(file.path(dir, "tx2gene.gencode.v27.csv"))
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)

ddsTxi <- DESeqDataSetFromTximport(
  txi,
  colData = samples,
  design = ~ condition
)

# Quick look at the inputs that define the design.
samples
samples$condition
# Toy expression tables: three wild-type and three knock-out replicates,
# with one (exp) or two (exp2) simulated genes drawn from N(mean = 1, sd = 1).
exp <- tibble(
  Condition = rep(c('wt', 'ko'), each = 3),
  Replicate = rep(1:3, times = 2),
  Gene_A = rnorm(n = 6, mean = 1)
)
exp2 <- tibble(
  Condition = rep(c('wt', 'ko'), each = 3),
  Replicate = rep(1:3, times = 2),
  Gene_A = rnorm(n = 6, mean = 1),
  Gene_B = rnorm(n = 6, mean = 1)
)
exp
exp2
# Inspect the dataset object, run the DESeq pipeline once, and look at the
# result through accessor functions rather than internal slots.

ddsTxi
class(ddsTxi)

# assay() returns the count matrix stored in the object.
head(assay(ddsTxi))

dds <- DESeq(ddsTxi)
class(dds)

# `$` on the dataset reads a column of the sample table (here `pop`).
dds$pop

# Accessors for the per-gene ranges and the count matrix.
rowRanges(dds)
head(counts(dds))

# NOTE: plotPCA() is not called here; it is applied later to the
# rlog-transformed object rather than to the raw dataset.

message("estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size")
## Estimating Dispersions
```{r, echo = F}
message('estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates')
```
- This is basically accounting for technical variability in the expression of individual genes (it's actually a lot more complicated than that, but that is beyond the scope of this course).
- For a more thorough explanation, see the documentation for `estimateDispersions`.
# Drop lowly expressed genes, normalise, and draw a PCA of the samples.

# Start from a fresh, unfiltered dataset so the filter is applied only once.
ddsTxi <- DESeqDataSetFromTximport(
  txi,
  colData = samples,
  design = ~ condition
)
dim(ddsTxi)

# Keep genes whose total count across all samples reaches the threshold.
# rowSums() needs the count matrix, so extract it with assay() first.
min_total_count <- 25
minimally_expressed <- rowSums(assay(ddsTxi)) >= min_total_count
ddsTxi_filtered <- ddsTxi[minimally_expressed, ]
dim(ddsTxi_filtered)

dds <- estimateSizeFactors(ddsTxi_filtered)
dds <- estimateDispersions(dds)

# plotPCA() works on a transformed object, so apply rlog() first.
rlds <- rlog(dds)
plotPCA(rlds)
library(ggplot2)
# Hand-made illustration of a PCA plot: four points in two conditions.
# The data frame must exist before it is handed to ggplot().
df <- tibble(
  x = c(-7, 3, -1, 10),
  y = c(-1, -2, 2, 1),
  condition = c("a", "a", "b", "b")
)

# The axes are deliberately swapped (y on the horizontal axis), and the
# labels mimic the "variance explained" annotations of a real PCA plot.
ggplot(df) +
  geom_point(aes(x = y, y = x, color = condition)) +
  xlab("PC1 45% variance explained") +
  ylab("PC2 30% variance explained")
library(knitr)
library(tidyverse)
#opts_chunk$set(echo = F, fig.height = 5, fig.width = 5)
opts_knit$set(root.dir = '/data/swamyvs/BIOF045-NGS-Data-Analysis-Day-3/')
suppressMessages(library(ComplexHeatmap))
# Sample-to-sample Spearman correlation of the rlog-transformed expression
# matrix, drawn as a heatmap.

# assay(rlds) is genes x samples and cor() correlates columns, so the
# matrix must NOT be transposed: transposing would correlate genes instead.
# Equivalent to cor(assay(rlds), method = "spearman").
cor_mat <- rlds %>%
  assay() %>%
  cor(method = "spearman")

Heatmap(cor_mat)

suppressMessages(library(pheatmap))

# annotation_row must be a data frame whose row names match the row names of
# cor_mat; drop = FALSE keeps the single column as a data frame rather than
# collapsing it to a bare factor.
sample_annotation <- samples[, "condition", drop = FALSE]

pheatmap(
  cor_mat,
  display_numbers = TRUE,
  number_format = "%.4f",
  annotation_row = sample_annotation
)
getwd()
library(knitr)
library(tidyverse)
opts_knit$set(root.dir = "/home/vinay/BIOF045-NGS-Data-Analysis-Day-3") #***CHANGE THIS TO THE RNA-seq DIRECTORY****
# Collect every salmon quantification file under the quant directory.
# The directory goes in `path` and the file-name filter in `pattern`;
# supplying `pattern` twice is an error.
quant_files <- list.files(
  path = "/data/day3_RNA-seq/quant",
  pattern = "quant.sf",
  recursive = TRUE,
  full.names = TRUE
)
quant_files
```{r}
# Import transcript-level salmon quantifications and summarise them to genes.

quant_files <- list.files(
  path = "/data/day3_RNA-seq/quant",
  pattern = "quant.sf",
  recursive = TRUE,
  full.names = TRUE
)
quant_files

# Read the annotation once, from the absolute path used for the rest of the
# data, so the result does not depend on the current working directory.
gtf <- rtracklayer::readGFF(
  "/data/day3_RNA-seq/references/transcript_annotation.gtf.gz"
)

# Two-column transcript -> gene lookup built from the transcript records.
tx2gene <- gtf %>%
  filter(type == "transcript") %>%
  select(transcript_id, gene_id) %>%
  distinct()

library(tximport)
txi <- tximport(
  files = quant_files,
  type = "salmon",
  tx2gene = tx2gene
)

# Inspect what the import returned.
names(txi)
txi$abundance
# Label the imported matrices with sample names.

# Each quant file sits in a directory named after its sample, so the parent
# directory name is the sample id. This does not depend on how many path
# components precede it (or on a trailing slash in the search path), unlike
# picking a fixed position from a split on "/".
sample_names <- basename(dirname(quant_files))

# Naming the input vector lets tximport label the columns of every matrix it
# returns, instead of patching abundance, counts and length one by one.
names(quant_files) <- sample_names
quant_files

txi <- tximport(
  files = quant_files,
  type = "salmon",
  tx2gene = tx2gene
)
names(txi)
txi$abundance
# Load the sample sheet and build the DESeq2 dataset for the course data.
library(knitr)
library(tidyverse)
library(DESeq2)
opts_knit$set(root.dir = "/home/vinay/BIOF045-NGS-Data-Analysis-Day-3") #***CHANGE THIS TO THE RNA-seq DIRECTORY****

sample_table <- read_csv("src/sample_table.csv")

# The design variable is spelled `treatment`; it must match the column of
# that name in the sample sheet.
dds <- DESeqDataSetFromTximport(
  txi,
  colData = sample_table,
  design = ~ treatment
)
dds
getwd()
library(knitr)
library(tidyverse)
opts_knit$set(root.dir = "/home/vinay/BIOF045-NGS-Data-Analysis-Day-3") #***CHANGE THIS TO THE RNA-seq DIRECTORY****
# Filter lowly expressed genes, normalise, transform and plot a PCA.

# Rebuild the dataset so the filter below is applied to the full gene set
# exactly once.
dds <- DESeqDataSetFromTximport(
  txi,
  colData = sample_table,
  design = ~ treatment
)
dim(dds)

# Total count per gene across samples; rowSums() operates on the count
# matrix returned by assay(), not on the dataset object itself.
gene_sums <- rowSums(assay(dds))
gene_sums_above_16 <- gene_sums > 16
dds <- dds[gene_sums_above_16, ]
dim(dds)

dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)

# Per-sample library totals after filtering.
colSums(assay(dds))

rlds <- rlog(dds)
rlds

# The sample sheet has no `condition` column (plotPCA's default grouping),
# so name the grouping variable explicitly.
plotPCA(rlds, intgroup = "treatment")
# Sample correlation heatmap and PCA coordinates for the course data.

mat <- assay(rlds)
cor_mat <- cor(mat)
cor_mat

library(pheatmap)
pheatmap(cor_mat)

# PCA coloured by treatment; returnData = TRUE returns the coordinates as a
# data frame instead of drawing the plot.
plotPCA(rlds, intgroup = "treatment")
pca_Data <- plotPCA(rlds, intgroup = "treatment", returnData = TRUE)
pca_Data

sample_table

# pheatmap matches annotations to heatmap rows by row name, so build a plain
# data frame keyed by sample id. Tibbles do not support row names, and a
# column name string is not a valid annotation_row value.
sample_annotation <- data.frame(
  treatment = sample_table$treatment,
  row.names = sample_table$sample
)
pheatmap(
  cor_mat,
  annotation_row = sample_annotation
)
# Differential expression: likelihood-ratio test of the dataset's design
# formula against the intercept-only reduced formula (~1).
dds <- nbinomLRT(dds, full = design(dds), reduced = ~1)
dds
results(dds)
# Full results table as a plain data frame, for interactive inspection.
diff_exp_results <- results(dds) %>% as.data.frame
View(diff_exp_results)
colnames(diff_exp_results )
# Move the row names into a `gene_id` column, then keep rows with
# |log2FoldChange| > 2 and padj < 0.05.
diff_exp_results <- results(dds) %>% as.data.frame %>%
rownames_to_column('gene_id') %>%
filter(abs(log2FoldChange) > 2, padj < .05 )
View(diff_exp_results)
diff_exp_results
# gene_id -> gene_name lookup taken from the 'gene' records of the annotation.
id2gene <- gtf %>% filter(type  == 'gene') %>% select(gene_id, gene_name) %>% distinct
# Gene names of the rows that passed the filter above.
diff_exp_gene_names <- id2gene %>% filter(gene_id %in% diff_exp_results$gene_id) %>% pull(gene_name)
diff_exp_gene_names
# GO enrichment of the differentially expressed genes. The genes are given
# as symbols (keyType = "SYMBOL") and tested against the Biological Process
# ontology (ont = "BP"). The call is made once; repeating it only recomputes
# the same result.
library(clusterProfiler)
gse <- enrichGO(
  gene = diff_exp_gene_names,
  OrgDb = "org.Hs.eg.db",
  keyType = "SYMBOL",
  ont = "BP"
)
```
# Plot the enrichment result, then draw a heatmap of the most significant
# differentially expressed genes.
dotplot(gse)

# head() returns at most 20 rows; indexing with 1:20 would pad the table
# with NA rows when fewer than 20 genes passed the filter, and those NA ids
# would then break the matrix subset below.
top_hits <- diff_exp_results %>%
  arrange(padj) %>%
  head(20)
top_hits

top20 <- top_hits %>%
  pull(gene_id)

mat <- assay(rlds)

# drop = FALSE keeps a matrix even if only one gene is selected.
pheatmap(mat[top20, , drop = FALSE])
# Volcano plot of the differential expression results.
library(EnhancedVolcano)

# EnhancedVolcano expects a results table plus the label vector and the names
# of the fold-change and p-value columns, so it is not called on the rlog
# object. The gene ids were moved out of the row names into the `gene_id`
# column, so label from that column; rownames() would give row numbers.
# NOTE(review): diff_exp_results is already filtered to |log2FC| > 2 and
# padj < .05, so only significant genes are drawn. Confirm whether the
# unfiltered results table was intended here.
EnhancedVolcano(
  diff_exp_results,
  lab = diff_exp_results$gene_id,
  FCcutoff = 2,
  x = "log2FoldChange",
  y = "pvalue"
)