-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02B_ARE_ERE_func.R
177 lines (167 loc) · 7.47 KB
/
02B_ARE_ERE_func.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# 0. Import libraries
library(readxl)
library(stringr)
library(reshape)
library(ggplot2)
# 1. Import data
ImportSignDE <- function(main_dir, ext, row_col) {
if (missing(ext)) {
deg_files <- list.files(path = main_dir, pattern = "\\.csv$",full.names = TRUE)
if (missing(row_col)) {
deg <- lapply(deg_files, read.csv, row.names=1)
}
else {
deg <- lapply(deg_files, read.csv, row.names=row_col)
}
}
else {
deg_files <- list.files(path = main_dir, pattern = paste0("\\.",ext,"$"),full.names = TRUE)
if (missing(row_col)) {
deg <- lapply(deg_files, read.csv, row.names=1)
}
else {
deg <- lapply(deg_files, read.csv, row.names=row_col)
}
}
names_deg <- list.files(path = main_dir, pattern = "\\.csv$",full.names = FALSE)
names(deg) <- substr(names_deg, 1, nchar(names_deg)-4)
return(deg)
}
# 2. Retrieve DEGs
ReadRawData <- function(main_dir,dis_type, sex, ext, row_col) {
path <- paste0(main_dir, dis_type, "/01A_DEGs")
sub_ct <- list.dirs(path, recursive=FALSE, full.names = FALSE)
df_sex <- list()
df_names <- vector()
for (ct in 1:length(sub_ct)) {
deg <- ImportSignDE(paste(path, sub_ct[ct], sep="/"))
for (i in names(deg)) {
if (grepl(sex, i, fixed=TRUE)){
df_names <- c(df_names, paste(sub_ct[ct], i, sep="_"))
df_sex <- append(df_sex, list(deg[[i]]))
}
}
}
names(df_sex) <- df_names
return(df_sex)
}
ReadRawData2 <- function(main_dir, dis_type, sex, ext, row_col) {
path <- paste0(main_dir, dis_type, "/outputs/01B_num_DEGs/")
sub_ct <- list.dirs(path, recursive=FALSE, full.names = FALSE)
df_sex <- list()
df_names <- vector()
for (ct in 1:length(sub_ct)) {
deg <- ImportSignDE(paste(path, sub_ct[ct], sep="/"))
for (i in names(deg)) {
if (grepl(sex, i, fixed=TRUE)){
df_names <- c(df_names, paste(sub_ct[ct], i, sep="_"))
df_sex <- append(df_sex, list(deg[[i]]))
}
}
}
names(df_sex) <- df_names
return(df_sex)
}
# 3. Calculate ARE df
AREdf <- function(df_sex, sex, ARE_DF) {
ARE_ls <- list("full" = unique(setdiff(ARE_DF$fullsites, ARE_DF$halfsites)),
"half" = unique(setdiff(ARE_DF$halfsites, ARE_DF$fullsites)),
"hf" = unique(intersect(ARE_DF$fullsites, ARE_DF$halfsites)))
df_ARE <- data.frame(as.factor(names(df_sex)))
colnames(df_ARE) <- c("ct")
df_ARE$ct <- sapply(1:length(df_ARE$ct), function(i) str_replace(df_ARE$ct[i], paste0("_", sex, "_filt"), ""))
df_ARE$bg <- sapply(1:length(names(df_sex)), function(i) nrow(df_sex[[i]]))
df_ARE$full <- sapply(1:length(names(df_sex)), function(i) length(intersect(rownames(df_sex[[i]]), ARE_ls[["full"]])))
df_ARE$half <- sapply(1:length(names(df_sex)), function(i) length(intersect(rownames(df_sex[[i]]), ARE_ls[["half"]])))
df_ARE$hf <- sapply(1:length(names(df_sex)), function(i) length(intersect(rownames(df_sex[[i]]), ARE_ls[["hf"]])))
df_ARE$no_overlap <- df_ARE$bg - df_ARE$full - df_ARE$half - df_ARE$hf
return(df_ARE)
}
# 4. Calculate percentages of ARE sites
AREdfPerc <- function(main_dir, dis_type, df_ARE, sex) {
out_path <- paste0(main_dir, dis_type, "/outputs/02B_ARE_ERE/")
dir.create(out_path, showWarnings = F, recursive = T)
df_ARE <- transform(df_ARE, full_perc = full * 100 / bg)
df_ARE <- transform(df_ARE, half_perc = half * 100 / bg)
df_ARE <- transform(df_ARE, hf_perc = hf * 100 / bg)
df_ARE <- transform(df_ARE, no_overlap_perc = no_overlap * 100 / bg)
write.csv(df_ARE, paste0(out_path, sex, "_ARE_sites.csv"))
df_ARE_perc <- df_ARE[, c(1, 7:10)]
df_ARE_perc <- melt(df_ARE_perc, id.vars = "ct")
names(df_ARE_perc)[names(df_ARE_perc) == 'value'] <- 'percent'
names(df_ARE_perc)[names(df_ARE_perc) == 'variable'] <- 'sites'
col_factors <- c("ct")
df_ARE_perc[col_factors] <- lapply(df_ARE_perc[col_factors], as.factor)
levels(df_ARE_perc$sites) <- c('Full', 'Half', 'Half-Full', 'None')
return(df_ARE_perc)
}
# 5. Calculate ERE df
EREdf <- function(df_sex, sex, ERE_gene) {
df_ERE <- data.frame(as.factor(names(df_sex)))
colnames(df_ERE) <- c("ct")
df_ERE$ct <- sapply(1:length(df_ERE$ct), function(i) str_replace(df_ERE$ct[i], paste0("_", sex, "_filt"), ""))
df_ERE$bg <- sapply(1:length(names(df_sex)), function(i) nrow(df_sex[[i]]))
df_ERE$ERE_overlap <- sapply(1:length(names(df_sex)), function(i) length(intersect(rownames(df_sex[[i]]), unique(ERE_gene))))
df_ERE$no_overlap <- df_ERE$bg - df_ERE$ERE_overlap
return(df_ERE)
}
# 6. Calculate percentages of ARE sites
EREdfPerc <- function(main_dir, dis_type, df_ERE, sex) {
out_path <- paste0(main_dir, dis_type, "/outputs/02B_ARE_ERE/")
dir.create(out_path, showWarnings = F, recursive = T)
df_ERE <- transform(df_ERE, ERE_perc = ERE_overlap * 100 / bg)
df_ERE <- transform(df_ERE, no_overlap_perc = no_overlap * 100 / bg)
write.csv(df_ERE, paste0(out_path, sex, "_ERE_sites.csv"))
df_ERE_perc <- df_ERE[, c(1, 5:6)]
df_ERE_perc <- melt(df_ERE_perc, id.vars = "ct")
names(df_ERE_perc)[names(df_ERE_perc) == 'value'] <- 'percent'
names(df_ERE_perc)[names(df_ERE_perc) == 'variable'] <- 'sites'
col_factors <- c("ct")
df_ERE_perc[col_factors] <- lapply(df_ERE_perc[col_factors], as.factor)
levels(df_ERE_perc$sites) <- c("ERE", "None")
return(df_ERE_perc)
}
# 5. Plot ARE or ERE sites
PlotAREERE <- function(main_dir, dis_type, df_sex, sex, are_ere, ct_ordered) {
out_path <- paste0(main_dir, dis_type, "/outputs/02B_ARE_ERE/")
dir.create(out_path, showWarnings = F, recursive = T)
if (are_ere=="ARE") {
col_palette <- c("#39B600", "#9590FF","#D376FF" , "#FD61D1")
} else if (are_ere=="ERE") {
col_palette <- c("#39B600", "#9590FF")
}
dis_ct_ordered <- ct_ordered[which(ct_ordered %in% levels(df_sex$ct))]
df_sex$ct <- factor(df_sex$ct, dis_ct_ordered)
df_sex <- df_sex[order(df_sex$ct), ]
pdf(paste0(out_path, sex, "_", are_ere, "_sites.pdf"))
print(ggplot(df_sex, aes(ct, percent, fill=sites)) +
geom_bar(stat="identity", position="stack", color="black") +
labs(title=sex, x="Cell types", y=paste0("% of ", are_ere, " sites"), fill=paste0("Overlap ", are_ere , " sites")) +
{if (are_ere=="ARE") scale_fill_manual(values = c('Full' = col_palette[4] , 'Half' = col_palette[3], 'Half-Full' = col_palette[2], 'None' = col_palette[1]))} +
{if (are_ere=="ERE") scale_fill_manual(values = c("ERE" = col_palette[2], "None" = col_palette[1]))} +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
axis.title.x = element_blank(),
axis.text.x = element_text(size=8, colour = "black",angle = 90, vjust = 0.7, hjust=0.5),
axis.ticks.x=element_blank(),
axis.title.y = element_text(size=12, face="bold", colour = "black"),
legend.position = "bottom",
legend.title = element_text(size=12, face="bold", colour = "black"))
)
dev.off()
}
# 7. MAIN
AnalysisARE_ERE <- function(main_dir, dis_type, ARE_DF, ERE_gene, ct_ordered) {
sexes <- c("F", "M")
for (sex in sexes) {
df_sex <- ReadRawData2(main_dir, dis_type, sex)
ARE_sex <- AREdf(df_sex, sex, ARE_DF)
ARE_sex <- AREdfPerc(main_dir, dis_type, ARE_sex, sex)
ERE_sex <- EREdf(df_sex, sex, ERE_gene)
ERE_sex <- EREdfPerc(main_dir, dis_type, ERE_sex, sex)
PlotAREERE(main_dir, dis_type, ARE_sex, sex, "ARE", ct_ordered)
PlotAREERE(main_dir, dis_type, ERE_sex, sex, "ERE", ct_ordered)
}
}