-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpolymorphism.py
287 lines (238 loc) · 11 KB
/
polymorphism.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
def count_polymorphic_sites(fasta):
'''
(file) -> int
Count the number of polymorphic sites in an multiple sequence alignment fasta file
Precondition: all sequences are aligned and have the same length
'''
# convert the alignment file into a dictionnary
from mirna_data_for_meme import convert_fasta
alignment = convert_fasta(fasta)
# fix the case of all sequences and make sure gaps are '-'
for gene in alignment:
alignment[gene] = alignment[gene].upper()
alignment[gene] = alignment[gene].replace('~', '-')
# make a list of keys
genes = []
for gene in alignment:
genes.append(gene)
polymorphisms = 0
reference = genes[0]
for i in range(len(alignment[reference])):
nucleotides = set()
for gene in alignment:
nucleotides.add(alignment[gene][i])
# exclude gap sites, Ns and sites, ambiguous sites and sites with more than 2 alleles
for base in nucleotides:
if base not in {'A', 'T', 'C', 'G'}:
nucleotides = set()
if len(nucleotides) == 2:
polymorphisms += 1
return polymorphisms
def compute_theta(fasta):
'''
(file) -> float
Return the Waterson estimator theta per site of nuleotide diversity for the fasta alignment file
'''
polymorphisms = count_polymorphic_sites(fasta)
# compute a: sum of 1/i from 1 to N-1 with N being the sample size
from mirna_data_for_meme import convert_fasta
alignment = convert_fasta(fasta)
N = len(alignment)
a = 0
for i in range(1, N):
a += 1/i
# compute theta per sequence
theta_sequence = polymorphisms / a
# make a list of keys
genes = []
for gene in alignment:
genes.append(gene)
reference = genes[0]
Total_Length = len(alignment[reference])
# compute the number of sites excluding gaps, sites with Ns or ambiguous sites
N_sites = Total_Length
for i in range(Total_Length):
nucleotides = set()
excluded_sites = set()
for gene in alignment:
nucleotides.add(alignment[gene][i])
for base in nucleotides:
if base not in {'A', 'T', 'C', 'G'}:
excluded_sites.add(base)
if len(excluded_sites) != 0:
N_sites -= 1
# compute theta per site
theta = theta_sequence / N_sites
# round theta to the 6th digit
return round(theta, 6)
def count_polymorphic_sites_motif(alignment, start, stop):
'''
(dict, str, int, int) -> int
Count the number of polymorphic sites in the alignment between positions start and stop
Precondition: all sequences are aligned and have the same length
'''
polymorphisms = 0
for i in range(start, stop):
nucleotides = set()
for gene in alignment:
nucleotides.add(alignment[gene][i])
# exclude gap sites, Ns and sites, ambiguous sites and sites with more than 2 alleles
for base in nucleotides:
if base not in {'A', 'T', 'C', 'G'}:
nucleotides = set()
if len(nucleotides) == 2:
polymorphisms += 1
return polymorphisms
def compute_theta_motif(fasta, mast_output, Length, upstreams, outputfile):
'''
(file, file, int, file, file) -> file
Return a dictionnary with theta for a given motif of size Length found in a mirna upstreams
with positions stored in mast_output from an multiple sequence alignment fasta file
Precondition: all sequences are aligned and have the same length
'''
from get_positions_from_mast import motif_position_from_mast
from mirna_data_for_meme import convert_fasta
import re
# convert the alignment file into a dictionnary
alignment = convert_fasta(fasta)
# fix the case of all sequences and make sure gaps are '-'
for gene in alignment:
alignment[gene] = alignment[gene].upper()
alignment[gene] = alignment[gene].replace('~', '-')
# find the reference sequence containing the motif positions in alignment
for gene in alignment:
if 'crm' in gene:
reference = gene
# get the positions from mast {mirna: [positions]}
position_tuple = motif_position_from_mast(mast_output)
motif_positions = position_tuple[1]
# create a dictionnary to store the motifs {mirna:[]}
motifs_seq = {}
for mirna in motif_positions:
motifs_seq[mirna] = []
# get the motifs from the file of upstream sequense
# store the motifs in the dictionary {mirna:[motifs]}
upstream_seqs = convert_fasta(upstreams)
for mirna in motif_positions:
for position in motif_positions[mirna]:
motif = upstream_seqs[mirna][position: position + Length].upper()
motifs_seq[mirna].append(motif)
# create a dictionnary with reference as key and set of patterns as value
# store the patterns in a set so as to avoid duplicate patterns
# each pattern must be unique because each pattern will find all motifs using regex
# if duplicate patterns are kept, divergence will be computed multiple times on the same motif
mirna_pattern = {}
if reference in motifs_seq:
for motif in motifs_seq[reference]:
pattern_list = []
pattern = ''
# a pattern include 3 gaps at most at each position
for i in range(len(motif)-1):
motif_with_gap = motif[0:i+1] +'-{0,3}' + motif[i+1:]
pattern_list.append(motif_with_gap)
for new_motif in pattern_list:
pattern = pattern + new_motif + '|'
pattern = pattern[0:-1]
if reference in mirna_pattern:
mirna_pattern[reference].add(pattern)
else:
mirna_pattern[reference] = {pattern,}
# create a dictionnary to store the theta values of each motif for a given mirna
mirna_polym = {}
if reference in mirna_pattern:
for pattern in mirna_pattern[reference]:
for motif in re.finditer(pattern, alignment[reference]):
start = motif.start()
stop = motif.start() + len(motif.group())
N_polym = count_polymorphic_sites_motif(alignment, start, stop)
N_sites = len(motif.group())
a = 0
N = len(alignment)
for i in range(1, N):
a += 1/i
theta_sequence = N_polym / a
for i in range(N_sites):
nucleotides = set()
excluded_sites = set()
for gene in alignment:
nucleotides.add(alignment[gene][i])
for base in nucleotides:
if base not in {'A', 'T', 'C', 'G'}:
excluded_sites.add(base)
if len(excluded_sites) != 0:
N_sites -= 1
if N_sites != 0:
theta = theta_sequence / N_sites
if reference in mirna_polym:
mirna_polym[reference].append(theta)
else:
mirna_polym[reference] = [theta]
# store the theta values into a file
if mirna_polym != {}:
with open(outputfile, 'a+') as myfile:
for theta in mirna_polym[reference]:
myfile.write(reference + '\t' + str(theta) + '\n')
def randomization_theta_motif(Length, N_repeats, outputfile):
'''
(int, int, file) -> file
Compute theta for N_repeats number of motifs of size Length chosen at random
and save the values in the outputfile
'''
import random
from mirna_data_for_meme import convert_fasta
filenames = {0: 'mir-79.txt', 1: 'mir-80.txt', 2: 'mir-81.txt', 3: 'mir-82.txt', 4: 'mir-83.txt',
5: 'mir-85.txt', 6: 'mir-45.txt', 7: 'mir-46.txt', 8: 'mir-47.txt', 9: 'mir-48.txt',
10: 'mir-49.txt', 11: 'mir-50.txt', 12: 'mir-52-2.txt', 13: 'mir-57.txt', 14: 'mir-58b.txt',
15: 'mir-59.txt', 16: 'mir-60.txt', 17: 'mir-61.txt', 18: 'mir-62.txt', 19: 'mir-67.txt',
20: 'mir-70.txt', 21: 'mir-71.txt', 22: 'mir-72.txt', 23: 'mir-73.txt', 24: 'mir-75.txt',
25: 'mir-76.txt', 26: 'mir-77-2.txt', 27: 'mir-252.txt', 28: 'mir-253.txt', 29: 'mir-259.txt',
30: 'mir-356.txt', 31: 'mir-86.txt', 32: 'mir-87.txt', 33: 'mir-90.txt', 34: 'mir-124.txt',
35: 'mir-124b.txt', 36: 'mir-231.txt', 37: 'mir-232-1.txt', 38: 'mir-232-2.txt', 39: 'mir-233.txt',
40: 'mir-235.txt', 41: 'mir-236.txt', 42: 'mir-237.txt', 43: 'mir-238.txt', 44: 'mir-239a.txt',
45: 'mir-239b.txt', 46: 'mir-240.txt', 47: 'mir-241.txt', 48: 'mir-244.txt', 49: 'mir-245.txt',
50: 'mir-246-1.txt', 51: 'mir-246-2.txt', 52: 'mir-247.txt', 53: 'mir-248.txt', 54: 'mir-249.txt',
55: 'mir-2230.txt', 56: 'mir-2231.txt', 57: 'mir-2254.txt', 58: 'mir-7601.txt', 59: 'mir-7604.txt',
60: 'mir-7606.txt', 61: 'mir-7608.txt', 62: 'mir-784.txt', 63: 'mir-787.txt', 64: 'mir-788.txt',
65: 'mir-1817b.txt', 66: 'mir-2228.txt', 67: 'mir-35i.txt', 68: 'mir-38.txt', 69: 'mir-42.txt',
70: 'let-7.txt', 71: 'lin-4.txt', 72: 'mir-1.txt', 73: 'mir-2.txt', 74: 'mir-34.txt', 75: 'mir-84.txt'}
thetas = []
while N_repeats > 0:
# draw a random file from all upstream files
file_number = random.randint(0, len(filenames)-1)
# make a dictionnary with sequences in the file
alignment = convert_fasta(filenames[file_number])
# fix the case of all sequences and make sure gaps are '-'
for gene in alignment:
alignment[gene] = alignment[gene].upper()
alignment[gene] = alignment[gene].replace('~', '-')
for mirna in alignment:
if 'crm' in mirna:
reference = mirna
# pick a random position in the sequence and grab a motif of size Length
start = random.randint(0, len(alignment[reference])- Length)
stop = start + Length
N_polym = count_polymorphic_sites_motif(alignment, start, stop)
N_sites = len(alignment[reference][start:stop])
a = 0
N = len(alignment)
for i in range(1, N):
a += 1/i
theta_sequence = N_polym / a
for i in range(N_sites):
nucleotides = set()
excluded_sites = set()
for gene in alignment:
nucleotides.add(alignment[gene][i])
for base in nucleotides:
if base not in {'A', 'T', 'C', 'G'}:
excluded_sites.add(base)
if len(excluded_sites) != 0:
N_sites -= 1
if N_sites != 0:
theta = theta_sequence / N_sites
thetas.append(theta)
N_repeats -= 1
myfile = open(outputfile, 'w')
for theta in thetas:
myfile.write('random' + '\t' + str(theta) + '\n')
myfile.close()