-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoverlap.r
186 lines (172 loc) · 10.1 KB
/
overlap.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
###############################################################
### File: overlap.r
### Description: Performs overlap analysis of results from
### microarray and RNA-seq differential gene
### expression analysis.
### Usage: Rscript overlap.r <log2FC> <fdr> <output_dir> <labels> <display_labels> <files> <microarray_annotation> <rnaseq_annotation>
### Written by Delaney Sullivan
###############################################################
# Rscript overlap.r 2 0.05 output/mouse_de/ liver_myc,kidney_myc,lung_myc,lung_mycras,lung_ras,tall_myc,eumyc_myc "HCC,RCC,LAC (MYC),LAC (MYC+KRAS),LAC (KRAS),T-ALL,BCL" microarray/processed_microarray/tissue/liver_diffexpr_gene.csv,microarray/processed_microarray/tissue/kidney_diffexpr_gene.csv,microarray/processed_microarray/tissue/lung_diffexpr_gene.csv,microarray/processed_microarray/tissue/lung_mycras_diffexpr_gene.csv,microarray/processed_microarray/tissue/lung_ras_diffexpr_gene.csv,rnaseq/tall_rnaseq/kallisto_aligned/tallmycon_sleuth_results_genes.csv,rnaseq/eumyc_rnaseq/kallisto_aligned/eumycon_sleuth_results_genes.csv microarray/annotation/MouseWG-6v2.csv rnaseq/annotation/gencode_GRCm38_vM15.csv
# Load R packages
pkgs <- c("UpSetR", "VennDiagram", "openxlsx")
invisible(lapply(pkgs, function(x) suppressWarnings(suppressMessages(library(x, character.only=TRUE)))))
# Read command-line arguments
args <- commandArgs(TRUE)
lfcThreshold <- log2(as.numeric(args[1]))
fdrThreshold <- as.numeric(args[2])
outputFolder <- args[3]
labels <- unlist(strsplit(args[4], ","))
display_labels <- unlist(strsplit(args[5], ","))
files <- unlist(strsplit(args[6], ","))
microarray_annotation <- read.csv(args[7], stringsAsFactors=FALSE, header=TRUE)
microarray_annotation <- microarray_annotation[!is.na(microarray_annotation$ID),]
microarray_annotation <- microarray_annotation[!duplicated(microarray_annotation$ID),]
rnaseq_annotation <- read.csv(args[8], stringsAsFactors=FALSE, header=TRUE)
rnaseq_annotation <- rnaseq_annotation[!is.na(rnaseq_annotation$ID),]
if (!all(c(length(labels), length(display_labels)) == length(files))) {
stop("<labels>, <display_labels>, and <files> must all have same number of comma-separated values")
}
data <- NULL
data_padj <- NULL
for (i in 1:length(files)) {
f <- files[i]
label <- labels[i]
df <- read.csv(f, stringsAsFactors=FALSE, header=TRUE)
if ("b" %in% colnames(df)) { # Sleuth RNA-seq data
df <- df[,c("gene_name","b","qval")]
df <- merge(rnaseq_annotation, df, by.x="Symbol", by.y="gene_name")
df$Symbol <- NULL
rownames(df) <- df$ID
df$ID <- NULL
colnames(df) <- c("logFC","padj")
} else { # Limma microarray data
df <- df[,c("ID","logFC","adj.P.Val")]
rownames(df) <- df$ID
df$ID <- NULL
colnames(df) <- c("logFC","padj")
}
df_logFC <- df[,"logFC",drop=FALSE]
colnames(df_logFC) <- label
df_padj <- df[,"padj",drop=FALSE]
colnames(df_padj) <- label
if (is.null(data)) {
data <- df_logFC
data_padj <- df_padj
} else {
data <- transform(merge(data,df_logFC,by=0,all=TRUE), row.names=Row.names, Row.names=NULL)
data_padj <- transform(merge(data_padj,df_padj,by=0,all=TRUE), row.names=Row.names, Row.names=NULL)
}
}
# Filter differentially expressed genes
data_significant_up <- data
data_significant_dn <- data
data_significant_up[!is.na(data) & data > lfcThreshold & !is.na(data_padj) & data_padj < fdrThreshold] <- TRUE
data_significant_up[is.na(data) | data <= lfcThreshold | is.na(data_padj) | data_padj >= fdrThreshold] <- FALSE
data_significant_dn[!is.na(data) & data < -lfcThreshold & !is.na(data_padj) & data_padj < fdrThreshold] <- TRUE
data_significant_dn[is.na(data) | data >= -lfcThreshold | is.na(data_padj) | data_padj >= fdrThreshold] <- FALSE
# Create excel spreadsheet table
data_order <- data_significant_up*2 + data_significant_dn
data_ordering <- order(rowSums(data_order != 0), apply(data_order, 1, function(x) { sort(table(x),decreasing=TRUE)[1] }), rowSums(data_order == 2), rowSums(data_order == 1), apply(data_order, 1, paste, collapse="-"), rowSums(!is.na(data)), decreasing=TRUE)
data_order <- data_order[data_ordering,]
data_excel <- data[data_ordering,]
colnames(data_excel) <- display_labels
annotation_intersection <- intersect(microarray_annotation$ID, rnaseq_annotation$ID)
microarray_remaining <- microarray_annotation[!(microarray_annotation$ID %in% annotation_intersection),]
microarray_remaining <- microarray_remaining[!duplicated(microarray_remaining$ID),c("Symbol","ID")]
annotation_all <- rbind(rnaseq_annotation, microarray_remaining)
data_excel <- merge(annotation_all, data_excel, by.x="ID", by.y=0, all.x=FALSE, all.y=TRUE)
data_excel <- data_excel[match(rownames(data_order), data_excel$ID),]
wb <- createWorkbook("Differentially Expressed Genes")
addWorksheet(wb, "Differentially Expressed Genes")
writeData(wb, sheet = 1, data_excel, rowNames = FALSE)
styleUp <- createStyle(fgFill = "red")
styleDn <- createStyle(fgFill = "blue")
for (i in 1:ncol(data_order)) {
addStyle(wb, sheet = 1, styleUp, rows = which(data_order[,i] == 2) + 1, cols = i+2)
addStyle(wb, sheet = 1, styleDn, rows = which(data_order[,i] == 1) + 1, cols = i+2)
}
saveWorkbook(wb, paste(outputFolder, "/de_table.xlsx", sep=""), overwrite = TRUE)
# Prepare to make UpSet plots and also output information about DE genes
sink(file=paste(outputFolder, "/log.txt", sep=""), split=TRUE)
listInputUp <- list()
listInputDn <- list()
de_genes_list_up <- list()
de_genes_list_dn <- list()
for (i in 1:ncol(data)) {
de_genes_list_up[[labels[i]]] <- rownames(data_significant_up[data_significant_up[,i] == TRUE,i,drop=FALSE])
de_genes_list_dn[[labels[i]]] <- rownames(data_significant_dn[data_significant_dn[,i] == TRUE,i,drop=FALSE])
display_label <- display_labels[i] # Label to display for UpSet plot
display_label <- unlist(strsplit(display_label, " \\(.*\\)")) # Remove stuff in parantheses in display label
if (!(display_label %in% names(listInputUp))) {
listInputUp[[display_label]] <- de_genes_list_up[[labels[i]]]
listInputDn[[display_label]] <- de_genes_list_dn[[labels[i]]]
}
print(paste(display_labels[i], "-", labels[i], "-", (sum(data_significant_up[,i]) + sum(data_significant_dn[,i])), "DE genes:", sum(data_significant_up[,i]), "up and", sum(data_significant_dn[,i]), "down", sep=" "))
}
print(paste("Number of genes total:", nrow(data)))
sink()
outputList <- function(de_genes_list, output_file, annotation=NULL) {
max.length <- max(sapply(de_genes_list, length))
df_genes <- lapply(de_genes_list, function(v) { c(v, rep(NA, max.length-length(v)))})
df_genes <- t(do.call(rbind, df_genes))
if (!is.null(annotation)) {
col_names <- colnames(df_genes)
df_genes <- as.data.frame(matrix(with(annotation, Symbol[match(df_genes[ ,1:ncol(df_genes)], ID)]), nrow = nrow(df_genes)))
colnames(df_genes) <- col_names
}
write.table(df_genes, file=output_file, sep="\t", quote=FALSE, row.names=FALSE, na="")
}
outputList(de_genes_list_up, paste(outputFolder, "/de_genes_up_ids.txt", sep=""))
outputList(de_genes_list_dn, paste(outputFolder, "/de_genes_down_ids.txt", sep=""))
outputList(de_genes_list_up, paste(outputFolder, "/de_genes_up_symbols.txt", sep=""), annotation_all)
outputList(de_genes_list_dn, paste(outputFolder, "/de_genes_down_symbols.txt", sep=""), annotation_all)
# Make UpSet plots
pdf(paste(outputFolder, "/upset_up.pdf", sep=""), width=10, height=7.5, onefile=FALSE)
upset(fromList(listInputUp), mb.ratio=c(0.65, 0.35), text.scale=c(2, 2, 1.5, 1.5, 2, 1.6), scale.intersections="identity", nsets=length(names(listInputUp)), order.by="degree", point.size=3.5, line.size=1.5)
device <- dev.off()
pdf(paste(outputFolder, "/upset_down.pdf", sep=""), width=10, height=7.5, onefile=FALSE)
upset(fromList(listInputDn), mb.ratio=c(0.65, 0.35), text.scale=c(2, 2, 1.5, 1.5, 2, 1.6), scale.intersections="identity", nsets=length(names(listInputDn)), order.by="degree", point.size=3.5, line.size=1.5)
device <- dev.off()
# Output summary table
if (length(listInputUp) == 5) { # If we have 5 sets, draw a Venn Diagram as well
pdf(paste(outputFolder, "/venn.pdf", sep=""), width=14, height=7)
listInputUp <<- listInputUp
listInputDn <<- listInputDn
overlap <- function(...) {
set_indices <- list(...)
total <- NULL
for (i in set_indices) {
if (is.null(total)) {
total <- listInputUp[[i]]
} else {
total <- intersect(total, listInputUp[[i]])
}
}
total2 <- NULL
for (i in set_indices) {
if (is.null(total2)) {
total2 <- listInputDn[[i]]
} else {
total2 <- intersect(total2, listInputDn[[i]])
}
}
return(length(total) + length(total2))
}
draw.quintuple.venn(overlap(1), overlap(2), overlap(3), overlap(4), overlap(5), overlap(1,2), overlap(1,3), overlap(1,4), overlap(1,5),
overlap(2,3), overlap(2,4), overlap(2,5), overlap(3,4), overlap(3,5), overlap(4,5), overlap(1,2,3), overlap(1,2,4), overlap(1,2,5), overlap(1,3,4),
overlap(1,3,5), overlap(1,4,5), overlap(2,3,4), overlap(2,3,5), overlap(2,4,5), overlap(3,4,5), overlap(1,2,3,4), overlap(1,2,3,5),
overlap(1,2,4,5), overlap(1,3,4,5), overlap(2,3,4,5), overlap(1,2,3,4,5), category = names(listInputUp),
lwd = rep(2, 5), lty = rep("solid", 5), col =
rep("black", 5), fill = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3"), alpha = rep(0.5, 5),
label.col = rep("black", 31), cex = c(rep(2.15, 5), rep(1.75, 20), rep(2.3, 5), 2.6),
fontface = c(rep("plain", 25), rep("bold", 6)), fontfamily = rep("serif",
31), cat.pos = c(0, 287.5+50, 215+25, 145, 70-50), cat.dist =
rep(0.22, 5), cat.col = rep("black", 5), cat.cex =
rep(2.5, 5), cat.fontface = rep("plain", 5),
cat.fontfamily = rep("serif", 5), cat.just =
rep(list(c(0.5, 0.5)), 5), rotation.degree = 0,
rotation.centre = c(0.5, 0.5), ind = TRUE, cex.prop =
NULL, print.mode = "raw", sigdigs = 3, direct.area =
FALSE, area.vector = 0, margin=0.022)
device <- dev.off()
}