-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathoverlap_phyper_v2.R
90 lines (81 loc) · 2.95 KB
/
overlap_phyper_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
overlap_phyper2 <- function(L1, L2, bg = length(unique(c(unlist(L1), unlist(L2)))), plot = FALSE,
                            title = "Overlap", remove.diag = FALSE, with.total = TRUE,
                            to.file = NULL, silent = FALSE, nsize = 15) {
  # Pairwise overlap between the elements of two lists, tested with phyper(),
  # and drawn as a heatmap with pheatmap().
  #
  # Arguments:
  #   L1, L2      lists of vectors (e.g. gene names or indices); rows of the
  #               result correspond to L1, columns to L2.
  #   bg          background (universe) size for the hypergeometric test;
  #               defaults to the number of unique entries across L1 and L2.
  #   plot        currently not used by the function body; kept so existing
  #               calls that pass it keep working.
  #   title       heatmap title (pheatmap `main`).
  #   remove.diag if TRUE, the diagonal of the p-value matrix is set to NA.
  #   with.total  if TRUE, a column `nTotal1` (sizes of the L1 elements) and a
  #               row `nTotal2` (sizes of the L2 elements, then the number of
  #               unique entries across both lists) are appended to M, and a
  #               matching NA column/row is appended to P.
  #   to.file     NULL to draw on the current device, otherwise passed to
  #               pheatmap as `filename` (pheatmap is then called silently).
  #   silent      passed to pheatmap as `silent` when to.file is NULL.
  #   nsize       passed to pheatmap as `fontsize_number`.
  #
  # Returns list(P = p-value matrix, M = overlap-count matrix,
  #              plot = value returned by pheatmap()).
  nL1 <- length(L1)
  nL2 <- length(L2)

  # matrix() instead of mat.or.vec(): the latter returns a plain vector when
  # nL2 == 1, which breaks the two-index subscripting used below.
  dims <- list(names(L1), names(L2))
  M <- matrix(0, nrow = nL1, ncol = nL2, dimnames = dims)
  P <- matrix(1, nrow = nL1, ncol = nL2, dimnames = dims)

  for (i in seq_len(nL1)) {
    size1 <- length(L1[[i]])
    for (j in seq_len(nL2)) {
      M[i, j] <- length(intersect(L1[[i]], L2[[j]]))
      # No overlap keeps p = 1. Otherwise, with q = overlap - 1 and
      # lower.tail = FALSE, phyper() gives P[X >= overlap].
      if (M[i, j] > 0) {
        P[i, j] <- phyper(M[i, j] - 1, size1, bg - size1, length(L2[[j]]),
                          lower.tail = FALSE)
      }
    }
  }

  # p-values can still be exactly zero, so add a pseudo-count one order of
  # magnitude below the smallest positive p-value before taking -log10.
  # which() drops NaN entries; if nothing is positive fall back to the
  # smallest positive normalized double instead of min(numeric(0)) == Inf.
  positive <- P[which(P > 0)]
  pseudo <- if (length(positive) > 0) min(positive) * 0.1 else .Machine$double.xmin

  if (remove.diag) {
    diag(P) <- NA
  }

  # Add a column/row with the total number of entries per list element.
  # The lower-right cell is the number of unique entries in the two lists.
  if (with.total) {
    M <- cbind(M, nTotal1 = lengths(L1))
    M <- rbind(M, nTotal2 = c(lengths(L2), length(unique(c(unlist(L1), unlist(L2))))))
    P <- cbind(P, rep(NA, nrow(P)))
    P <- rbind(P, rep(NA, ncol(P)))
  }

  # Use a visible `pheatmap` function if there is one, otherwise attach the
  # package; fail with a clear message instead of "could not find function".
  if (!exists("pheatmap", mode = "function") &&
      !suppressMessages(require("pheatmap", character.only = TRUE, quietly = TRUE))) {
    stop("package 'pheatmap' is required to draw the overlap heatmap", call. = FALSE)
  }

  # `pl` is assigned in both branches so the return value below is always
  # defined (previously it was missing whenever to.file was given).
  if (is.null(to.file)) {
    pl <- pheatmap(-log10(P + pseudo), cluster_rows = FALSE, cluster_cols = FALSE,
                   display_numbers = M, main = title, silent = silent,
                   fontsize_number = nsize)
  } else {
    pl <- pheatmap(-log10(P + pseudo), cluster_rows = FALSE, cluster_cols = FALSE,
                   display_numbers = M, main = title, filename = to.file,
                   silent = TRUE, fontsize_number = nsize)
  }

  list(P = P, M = M, plot = pl)
}
pred_phyper <- function(L1, L2, bg = length(unique(c(unlist(L1), unlist(L2)))), cutoff = 0.01, nM = 3, with.cl.names = FALSE) {
  # For each element of L1, predict the element of L2 it overlaps best with.
  #
  # Takes two lists of gene-name vectors and builds a matrix of phyper()
  # p-values for the overlap between every L1/L2 pair. Each L1 element is
  # assigned the name of the L2 element with the lowest p-value (the first
  # one on ties).
  #
  # Arguments:
  #   L1, L2         lists of vectors; the result has one entry per L1 element.
  #   bg             background (universe) size for the test; defaults to the
  #                  number of unique entries across L1 and L2.
  #   cutoff         an L1 element is labelled "Unass" when its lowest p-value
  #                  is greater than cutoff.
  #   nM             pairs sharing fewer than nM entries are not tested and
  #                  keep p = 1.
  #   with.cl.names  if TRUE, labels are returned as "<L1 name>:<label>".
  #
  # Returns a character vector of labels, named by names(L1).
  nL1 <- length(L1)
  nL2 <- length(L2)

  # matrix() instead of mat.or.vec(): the latter returns a plain vector when
  # nL2 == 1, which breaks the two-index subscripting used below.
  M <- matrix(0, nrow = nL1, ncol = nL2)
  P <- matrix(1, nrow = nL1, ncol = nL2)

  for (i in seq_len(nL1)) {
    size1 <- length(L1[[i]])
    for (j in seq_len(nL2)) {
      M[i, j] <- length(intersect(L1[[i]], L2[[j]]))
      # With q = overlap - 1 and lower.tail = FALSE, phyper() gives
      # P[X >= overlap].
      if (M[i, j] >= nM) {
        P[i, j] <- phyper(M[i, j] - 1, size1, bg - size1, length(L2[[j]]),
                          lower.tail = FALSE)
      }
    }
  }

  if (nL1 == 0L || nL2 == 0L) {
    # Nothing to compare against: every L1 element (if any) is unassigned.
    pred <- rep("Unass", nL1)
  } else {
    # Column with the lowest p-value per row, then reject weak matches.
    lowest <- apply(P, 1, which.min)
    pred <- names(L2)[lowest]
    pred[apply(P, 1, min) > cutoff] <- "Unass"
  }

  if (with.cl.names) {
    pred <- paste(names(L1), pred, sep = ":")
  }
  names(pred) <- names(L1)
  pred
}