14
14
import zipfile
15
15
import argparse
16
16
import pandas as pd
17
- from oggmap import qlin
17
+ from oggmap import of2orthomap , qlin
18
18
19
19
20
20
def define_parser ():
@@ -25,7 +25,7 @@ def define_parser():
25
25
26
26
:rtype: argparse.ArgumentParser
27
27
"""
28
- of2orthomap_example = '''broccoli2orthomap example:
28
+ broccoli2orthomap_example = '''broccoli2orthomap example:
29
29
30
30
# download Broccoli example:
31
31
$ wget https://zenodo.org/records/14935293/files/broccoli_example_table_OGs_protein_counts.txt
@@ -92,7 +92,7 @@ def get_broccoli_orthomap(seqname,
92
92
ncbi = None ,
93
93
dbname = None ):
94
94
"""
95
- This function return an orthomap for a given query species and PLAZA gene family data.
95
+ This function return an orthomap for a given query species and Broccoli input data.
96
96
97
97
:param qt: Query species taxID.
98
98
:param sl: Path to species list file containing <Broccoli name><tab><species taxID>.
@@ -120,7 +120,20 @@ def get_broccoli_orthomap(seqname,
120
120
121
121
Example
122
122
-------
123
- >>>
123
+ >>> from oggmap import datasets, broccoli2orthomap, of2orthomap, qlin
124
+ >>> datasets.broccoli_example(datapath='.')
125
+ >>> query_orthomap, orthofinder_species_list, of_species_abundance = broccoli2orthomap.get_broccoli_orthomap(
126
+ >>> seqname='proteome.selected_transcript.ath.fasta',
127
+ >>> qt='3702',
128
+ >>> sl='broccoli_example_species_list.tsv',
129
+ >>> oc='broccoli_example_table_OGs_protein_counts.txt',
130
+ >>> og='broccoli_example_table_OGs_protein_names.txt',
131
+ >>> out=None,
132
+ >>> quiet=False,
133
+ >>> continuity=True,
134
+ >>> overwrite=True,
135
+ >>> dbname='taxadb.sqlite')
136
+ >>> query_orthomap
124
137
"""
125
138
outhandle = None
126
139
og_continuity_score = None
@@ -142,109 +155,128 @@ def get_broccoli_orthomap(seqname,
142
155
sep = '\t ' ,
143
156
header = None ,
144
157
comment = '#' )
145
- species_list .columns = ['species' , 'common_name' , 'tax_id' , 'source' , 'data_provider' , 'pubmed_id' ]
146
- qt_species = list (species_list ['species' ][species_list ['tax_id' ] == int (qt )])
147
- if len (qt_species ) == 0 :
148
- print ('\n Error <-qt>: query species taxID not in PLAZA results, please check taxID.' )
149
- sys .exit ()
150
- ogs = pd .DataFrame (pd .read_csv (og ,
151
- sep = '\t ' ,
152
- header = None ,
153
- comment = '#' ))
154
- ogs .columns = ['gf_id' , 'species' , 'gene_id' ]
155
- ogs_grouped = ogs .groupby ('gf_id' )['species' ].apply (set ).apply (list ).apply (_get_species_tax_id ,
156
- species_list = species_list )
157
- ogs_grouped_qt = pd .DataFrame (ogs_grouped [ogs_grouped .apply (lambda x : int (qt ) in x )])
158
- ogs_qt = ogs [ogs ['gf_id' ].isin (ogs_grouped_qt .index )]
159
- ogs_qt_red = ogs_qt [ogs_qt ['species' ].isin (qt_species )]
160
- ogs_qt_red_grouped = ogs_qt_red .groupby ('gf_id' )['gene_id' ].apply (list )
161
- ogs_grouped_qt ['gene_id' ] = ogs_qt_red_grouped
162
- ogs_grouped_qt_species = np .sort (list (set ([x [0 ] for x in ogs_grouped_qt ['species' ].to_dict ().values ()])))
163
- ogs_grouped_qt_species_names = [qlin .get_qlin (qt = x ,
164
- quiet = True ,
165
- ncbi = ncbi )[0 ] for x in ogs_grouped_qt_species ]
166
- species_list_df = pd .DataFrame (ogs_grouped_qt_species_names ,
167
- columns = ['species' ])
168
- species_list_df ['taxID' ] = ogs_grouped_qt_species
169
- species_list_df ['lineage' ] = species_list_df .apply (lambda x : qlin .ncbi_get_lineage (qt = x .iloc [1 ],
170
- ncbi = ncbi ),
171
- axis = 1 )
172
- species_list_df ['youngest_common' ] = [qlin .get_youngest_common (qlineage ,
173
- x ) for x in species_list_df .lineage ]
174
- species_list_df ['youngest_name' ] = [list (x .values ())[0 ] for x in [qlin .ncbi_get_taxid_translator (qt_vec = [x ],
175
- ncbi = ncbi )
176
- for x in list (species_list_df .youngest_common )]]
158
+ species_list .columns = ['species' , 'taxID' ]
159
+ species_list ['lineage' ] = species_list .apply (lambda x : qlin .ncbi_get_lineage (qt = x .iloc [1 ],
160
+ ncbi = ncbi ),
161
+ axis = 1 )
162
+ species_list ['youngest_common' ] = [qlin .get_youngest_common (qlineage , x ) for x in species_list .lineage ]
163
+ species_list ['youngest_name' ] = [list (x .values ())[0 ] for x in [qlin .ncbi_get_taxid_translator (qt_vec = [x ],
164
+ ncbi = ncbi )
165
+ for x in list (species_list .youngest_common )]]
177
166
if not quiet :
167
+ print (seqname )
178
168
print (qname )
179
169
print (qt )
180
- print (species_list_df )
170
+ print (species_list )
181
171
youngest_common_counts_df = of2orthomap .get_youngest_common_counts (qlineage ,
182
- species_list_df )
172
+ species_list )
183
173
for node in qlin .traverse_postorder (query_lineage_topo .root ):
184
174
if node .name :
185
175
nsplit = node .name .split ('/' )
186
176
if len (nsplit ) == 3 :
187
177
node .species_count = list (youngest_common_counts_df [youngest_common_counts_df .PStaxID .isin (
188
178
[int (nsplit [1 ])])].counts )[0 ]
189
- #for node in query_lineage_topo.traverse('postorder'):
190
- # nsplit = node.name.split('/')
191
- # if len(nsplit) == 3:
192
- # node.add_feature('species_count',
193
- # list(youngest_common_counts_df[youngest_common_counts_df.PStaxID.isin(
194
- # [int(nsplit[1])])].counts)[0])
195
- og_dict = {}
179
+ oc_og_dict = {}
196
180
continuity_dict = {}
197
- for og in ogs_grouped_qt .index :
198
- og_hits = np .sort (
199
- list (set (list (ogs_grouped_qt [ogs_grouped_qt .index .isin ([og ])]['species' ].to_dict ().values ())[0 ])))
200
- # get list of the youngest common between query and all other species
201
- og_hits_youngest_common = list (species_list_df .youngest_common [
202
- [x for x , y in enumerate (species_list_df .taxID )
203
- if y in og_hits ]])
204
- # evaluate all youngest common nodes to retain the oldest of them and assign as the orthogroup
205
- # ancestral state (gene age)
206
- if len (og_hits_youngest_common ) > 0 :
207
- og_oldest_common = qlin .get_oldest_common (qlineage ,
208
- og_hits_youngest_common )
209
- og_dict [og ] = og_oldest_common
210
- if continuity :
211
- continuity_dict [og ] = \
212
- of2orthomap .get_youngest_common_counts (qlineage ,
213
- pd .DataFrame (og_hits_youngest_common ,
214
- columns = ['youngest_common' ])).counts
181
+ if os .path .basename (oc ).split ('.' )[- 1 ] == 'zip' :
182
+ oc_zip = zipfile .Path (oc , at = '.' .join (os .path .basename (oc ).split ('.' )[:- 1 ]))
183
+ oc_lines = oc_zip .open ()
184
+ else :
185
+ oc_lines = open (oc ,
186
+ 'r' )
187
+ oc_species = next (oc_lines )
188
+ if type (oc_species ) == bytes :
189
+ oc_species = oc_species .decode ('utf-8' ).strip ().split ('\t ' )
190
+ else :
191
+ oc_species = oc_species .strip ().split ('\t ' )
192
+ oc_qidx = [x for x , y in enumerate (oc_species ) if y == seqname ]
193
+ if len (oc_qidx ) == 0 :
194
+ print ('\n Error <-qname>: query species name not in Broccoli results, please check spelling\n '
195
+ 'e.g. <head -1 table_OGs_protein_counts.txt>' )
196
+ sys .exit ()
197
+ for oc_line in oc_lines :
198
+ if type (oc_line ) == bytes :
199
+ oc_og = oc_line .decode ('utf-8' ).strip ().split ('\t ' )
200
+ else :
201
+ oc_og = oc_line .strip ().split ('\t ' )
202
+ if int (oc_og [oc_qidx [0 ]]) == 0 :
203
+ continue
204
+ if int (oc_og [oc_qidx [0 ]]) > 0 :
205
+ oc_og_hits = [oc_species [x + 1 ] for x , y in enumerate (oc_og [1 ::][::- 1 ][1 ::][::- 1 ]) if int (y ) > 0 ]
206
+ # get list of the youngest common between query and all other species
207
+ oc_og_hits_youngest_common = list (species_list .youngest_common [
208
+ [x for x , y in enumerate (species_list .species )
209
+ if y in oc_og_hits ]])
210
+ # evaluate all youngest common nodes to retain the oldest of them and assign as the orthogroup
211
+ # ancestral state (gene age)
212
+ if len (oc_og_hits_youngest_common ) > 0 :
213
+ oc_og_oldest_common = qlin .get_oldest_common (qlineage ,
214
+ oc_og_hits_youngest_common )
215
+ oc_og_dict [oc_og [0 ]] = oc_og_oldest_common
216
+ if continuity :
217
+ continuity_dict [oc_og [0 ]] = of2orthomap .get_youngest_common_counts (
218
+ qlineage ,
219
+ pd .DataFrame (oc_og_hits_youngest_common ,
220
+ columns = ['youngest_common' ])).counts
221
+ oc_lines .close ()
215
222
if continuity :
216
223
youngest_common_counts_df = youngest_common_counts_df .join (pd .DataFrame .from_dict (continuity_dict ))
217
224
omap = []
218
225
if out :
219
226
if os .path .exists (out ) and not overwrite :
220
227
print ('\n Error <-overwrite>: output file exists, please set to True if it should be overwritten\n ' )
221
228
sys .exit ()
222
- outhandle = open (out , 'w' )
229
+ outhandle = open (out ,
230
+ 'w' )
223
231
if continuity :
224
232
outhandle .write ('seqID\t Orthogroup\t PSnum\t PStaxID\t PSname\t PScontinuity\n ' )
225
233
else :
226
234
outhandle .write ('seqID\t Orthogroup\t PSnum\t PStaxID\t PSname\n ' )
227
- for og in ogs_grouped_qt .index :
228
- og_tmp = ogs_grouped_qt [ogs_grouped_qt .index .isin ([og ])]
229
- og_ps = qlineagenames [qlineagenames ['PStaxID' ] ==
230
- str (og_dict [og ])].values .tolist ()[0 ]
231
- og_ps_join = '\t ' .join (og_ps )
232
- if continuity :
233
- og_continuity_score = of2orthomap .get_continuity_score (og ,
234
- youngest_common_counts_df )
235
+ if os .path .basename (og ).split ('.' )[- 1 ] == 'zip' :
236
+ og_zip = zipfile .Path (og ,
237
+ at = '.' .join (os .path .basename (og ).split ('.' )[:- 1 ]))
238
+ og_lines = og_zip .open ()
239
+ else :
240
+ og_lines = open (og ,
241
+ 'r' )
242
+ og_species = next (og_lines )
243
+ if type (og_species ) == bytes :
244
+ og_species = og_species .decode ('utf-8' ).strip ().split ('\t ' )
245
+ else :
246
+ og_species = og_species .strip ().split ('\t ' )
247
+ og_qidx = [x for x , y in enumerate (og_species ) if y == seqname ]
248
+ if len (oc_qidx ) == 0 :
249
+ print ('\n Error <-qname>: query species name not in Broccoli results, please check spelling\n '
250
+ 'e.g. <head -1 table_OGs_protein_counts.txt>' )
251
+ sys .exit ()
252
+ for og_line in og_lines :
253
+ if type (og_line ) == bytes :
254
+ og_og = og_line .decode ('utf-8' ).strip ().split ('\t ' )
255
+ else :
256
+ og_og = og_line .strip ().split ('\t ' )
257
+ if og_og [0 ] not in oc_og_dict :
258
+ continue
259
+ else :
260
+ og_ps = qlineagenames [qlineagenames ['PStaxID' ] ==
261
+ str (oc_og_dict [og_og [0 ]])].values .tolist ()[0 ]
262
+ og_ps_join = '\t ' .join (og_ps )
263
+ if continuity :
264
+ og_continuity_score = of2orthomap .get_continuity_score (og_name = og_og [0 ],
265
+ youngest_common_counts_df = youngest_common_counts_df )
235
266
if out :
236
267
if continuity :
237
- [outhandle .write (x .replace (' ' , '' ) + '\t ' + og + '\t ' + og_ps_join + '\t ' +
238
- str (og_continuity_score ) + '\n ' ) for x in list ( og_tmp [ 'gene_id' ]) [0 ]]
268
+ [outhandle .write (x .replace (' ' , '' ) + '\t ' + og_og [ 0 ] + '\t ' + og_ps_join + '\t ' +
269
+ str (og_continuity_score ) + '\n ' ) for x in og_og [ og_qidx [0 ]]. split ( ',' ) ]
239
270
else :
240
- [outhandle .write (x .replace (' ' , '' ) + '\t ' + og + '\t ' + og_ps_join + '\n ' )
241
- for x in list ( og_tmp [ 'gene_id' ]) [0 ]]
271
+ [outhandle .write (x .replace (' ' , '' ) + '\t ' + og_og [ 0 ] + '\t ' + og_ps_join + '\n ' )
272
+ for x in og_og [ og_qidx [0 ]]. split ( ',' ) ]
242
273
if continuity :
243
- omap += [[x .replace (' ' , '' ), og , og_ps [0 ], og_ps [1 ], og_ps [2 ], og_continuity_score ]
244
- for x in list ( og_tmp [ 'gene_id' ]) [0 ]]
274
+ omap += [[x .replace (' ' , '' ), og_og [ 0 ] , og_ps [0 ], og_ps [1 ], og_ps [2 ], og_continuity_score ]
275
+ for x in og_og [ og_qidx [0 ]]. split ( ',' ) ]
245
276
else :
246
- omap += [[x .replace (' ' , '' ), og , og_ps [0 ], og_ps [1 ], og_ps [2 ]]
247
- for x in list (og_tmp ['gene_id' ])[0 ]]
277
+ omap += [[x .replace (' ' , '' ), og_og [0 ], og_ps [0 ], og_ps [1 ], og_ps [2 ]]
278
+ for x in og_og [og_qidx [0 ]].split (',' )]
279
+ og_lines .close ()
248
280
if out :
249
281
outhandle .close ()
250
282
omap_df = pd .DataFrame (omap )
@@ -263,7 +295,7 @@ def get_broccoli_orthomap(seqname,
263
295
'PSname' ]
264
296
omap_df ['PSnum' ] = [int (x ) for x in list (omap_df ['PSnum' ])]
265
297
return [omap_df ,
266
- species_list_df ,
298
+ species_list ,
267
299
youngest_common_counts_df ]
268
300
269
301
@@ -277,26 +309,36 @@ def main():
277
309
if not args .dbname :
278
310
print ('\n Error <-dbname>: Please specify taxadb.sqlite file' )
279
311
sys .exit ()
312
+ if not args .seqname :
313
+ parser .print_help ()
314
+ print ('\n Error <-seqname>: Please specify query species name in Broccoli and taxID' )
315
+ sys .exit ()
280
316
if not args .qt :
281
317
parser .print_help ()
282
318
print ('\n Error <-qt>: Please specify query species taxID' )
283
319
sys .exit ()
284
320
if not args .sl :
285
321
parser .print_help ()
286
- print ('\n Error <-sl>: Please specify PLAZA species information file <species_information.csv>' )
322
+ print ('\n Error <-sl>: Please specify species list as <Broccoli name><tab><species taxID>' )
323
+ sys .exit ()
324
+ if not args .oc :
325
+ parser .print_help ()
326
+ print ('\n Error <-oc>: Please specify Broccoli <table_OGs_protein_counts.txt> (see dir_step3 directory)' )
287
327
sys .exit ()
288
328
if not args .og :
289
329
parser .print_help ()
290
- print ('\n Error <-og>: Please specify PLAZA gene family file <genefamily_data.ORTHOFAM.csv> or '
291
- '<genefamily_data.HOMFAM.csv>' )
330
+ print ('\n Error <-og>: Please specify Broccoli <table_OGs_protein_names.txt> (see dir_step3 directory)' )
292
331
sys .exit ()
293
- get_plaza_orthomap (seqname = args .seqname ,
294
- qt = args .qt ,
295
- sl = args .sl ,
296
- og = args .og ,
297
- out = args .out ,
298
- overwrite = args .overwrite ,
299
- dbname = args .dbname )
332
+ get_broccoli_orthomap (seqname = args .seqname ,
333
+ qt = args .qt ,
334
+ sl = args .sl ,
335
+ oc = args .oc ,
336
+ og = args .og ,
337
+ out = args .out ,
338
+ quiet = False ,
339
+ continuity = True ,
340
+ overwrite = args .overwrite ,
341
+ dbname = args .dbname )
300
342
301
343
302
344
if __name__ == '__main__' :
0 commit comments