Skip to content

Commit a23ff2b

Browse files
committed
added broccoli2orthomap.py
1 parent f329873 commit a23ff2b

File tree

4 files changed

+137
-94
lines changed

4 files changed

+137
-94
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ repository = "https://github.com/kullrich/oggmap"
6565

6666
[project.scripts]
6767
oggmap = "oggmap.__main__:main"
68+
broccoli2orthomap = "oggmap.broccoli2orthomap:main"
6869
cds2aa = "oggmap.cds2aa:main"
6970
eggnog2orthomap = "oggmap.eggnog2orthomap:main"
7071
plaza2orthomap = "oggmap.plaza2orthomap:main"

src/oggmap/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import sys
1515
import argparse
1616
from Bio import SeqIO
17-
from oggmap import cds2aa, eggnog2orthomap, gtf2t2g, ncbitax, of2orthomap, orthomcl2orthomap, plaza2orthomap, qlin
17+
from oggmap import broccoli2orthomap, cds2aa, eggnog2orthomap, gtf2t2g, ncbitax, of2orthomap, orthomcl2orthomap, plaza2orthomap, qlin
1818

1919

2020
def define_parser():

src/oggmap/broccoli2orthomap.py

Lines changed: 133 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import zipfile
1515
import argparse
1616
import pandas as pd
17-
from oggmap import qlin
17+
from oggmap import of2orthomap, qlin
1818

1919

2020
def define_parser():
@@ -25,7 +25,7 @@ def define_parser():
2525
2626
:rtype: argparse.ArgumentParser
2727
"""
28-
of2orthomap_example = '''broccoli2orthomap example:
28+
broccoli2orthomap_example = '''broccoli2orthomap example:
2929
3030
# download Broccoli example:
3131
$ wget https://zenodo.org/records/14935293/files/broccoli_example_table_OGs_protein_counts.txt
@@ -92,7 +92,7 @@ def get_broccoli_orthomap(seqname,
9292
ncbi=None,
9393
dbname=None):
9494
"""
95-
This function return an orthomap for a given query species and PLAZA gene family data.
95+
This function return an orthomap for a given query species and Broccoli input data.
9696
9797
:param qt: Query species taxID.
9898
:param sl: Path to species list file containing <Broccoli name><tab><species taxID>.
@@ -120,7 +120,20 @@ def get_broccoli_orthomap(seqname,
120120
121121
Example
122122
-------
123-
>>>
123+
>>> from oggmap import datasets, broccoli2orthomap, of2orthomap, qlin
124+
>>> datasets.broccoli_example(datapath='.')
125+
>>> query_orthomap, orthofinder_species_list, of_species_abundance = broccoli2orthomap.get_broccoli_orthomap(
126+
>>> seqname='proteome.selected_transcript.ath.fasta',
127+
>>> qt='3702',
128+
>>> sl='broccoli_example_species_list.tsv',
129+
>>> oc='broccoli_example_table_OGs_protein_counts.txt',
130+
>>> og='broccoli_example_table_OGs_protein_names.txt',
131+
>>> out=None,
132+
>>> quiet=False,
133+
>>> continuity=True,
134+
>>> overwrite=True,
135+
>>> dbname='taxadb.sqlite')
136+
>>> query_orthomap
124137
"""
125138
outhandle = None
126139
og_continuity_score = None
@@ -142,109 +155,128 @@ def get_broccoli_orthomap(seqname,
142155
sep='\t',
143156
header=None,
144157
comment='#')
145-
species_list.columns = ['species', 'common_name', 'tax_id', 'source', 'data_provider', 'pubmed_id']
146-
qt_species = list(species_list['species'][species_list['tax_id'] == int(qt)])
147-
if len(qt_species) == 0:
148-
print('\nError <-qt>: query species taxID not in PLAZA results, please check taxID.')
149-
sys.exit()
150-
ogs = pd.DataFrame(pd.read_csv(og,
151-
sep='\t',
152-
header=None,
153-
comment='#'))
154-
ogs.columns = ['gf_id', 'species', 'gene_id']
155-
ogs_grouped = ogs.groupby('gf_id')['species'].apply(set).apply(list).apply(_get_species_tax_id,
156-
species_list=species_list)
157-
ogs_grouped_qt = pd.DataFrame(ogs_grouped[ogs_grouped.apply(lambda x: int(qt) in x)])
158-
ogs_qt = ogs[ogs['gf_id'].isin(ogs_grouped_qt.index)]
159-
ogs_qt_red = ogs_qt[ogs_qt['species'].isin(qt_species)]
160-
ogs_qt_red_grouped = ogs_qt_red.groupby('gf_id')['gene_id'].apply(list)
161-
ogs_grouped_qt['gene_id'] = ogs_qt_red_grouped
162-
ogs_grouped_qt_species = np.sort(list(set([x[0] for x in ogs_grouped_qt['species'].to_dict().values()])))
163-
ogs_grouped_qt_species_names = [qlin.get_qlin(qt=x,
164-
quiet=True,
165-
ncbi=ncbi)[0] for x in ogs_grouped_qt_species]
166-
species_list_df = pd.DataFrame(ogs_grouped_qt_species_names,
167-
columns=['species'])
168-
species_list_df['taxID'] = ogs_grouped_qt_species
169-
species_list_df['lineage'] = species_list_df.apply(lambda x: qlin.ncbi_get_lineage(qt=x.iloc[1],
170-
ncbi=ncbi),
171-
axis=1)
172-
species_list_df['youngest_common'] = [qlin.get_youngest_common(qlineage,
173-
x) for x in species_list_df.lineage]
174-
species_list_df['youngest_name'] = [list(x.values())[0] for x in [qlin.ncbi_get_taxid_translator(qt_vec=[x],
175-
ncbi=ncbi)
176-
for x in list(species_list_df.youngest_common)]]
158+
species_list.columns = ['species', 'taxID']
159+
species_list['lineage'] = species_list.apply(lambda x: qlin.ncbi_get_lineage(qt=x.iloc[1],
160+
ncbi=ncbi),
161+
axis=1)
162+
species_list['youngest_common'] = [qlin.get_youngest_common(qlineage, x) for x in species_list.lineage]
163+
species_list['youngest_name'] = [list(x.values())[0] for x in [qlin.ncbi_get_taxid_translator(qt_vec=[x],
164+
ncbi=ncbi)
165+
for x in list(species_list.youngest_common)]]
177166
if not quiet:
167+
print(seqname)
178168
print(qname)
179169
print(qt)
180-
print(species_list_df)
170+
print(species_list)
181171
youngest_common_counts_df = of2orthomap.get_youngest_common_counts(qlineage,
182-
species_list_df)
172+
species_list)
183173
for node in qlin.traverse_postorder(query_lineage_topo.root):
184174
if node.name:
185175
nsplit = node.name.split('/')
186176
if len(nsplit) == 3:
187177
node.species_count = list(youngest_common_counts_df[youngest_common_counts_df.PStaxID.isin(
188178
[int(nsplit[1])])].counts)[0]
189-
#for node in query_lineage_topo.traverse('postorder'):
190-
# nsplit = node.name.split('/')
191-
# if len(nsplit) == 3:
192-
# node.add_feature('species_count',
193-
# list(youngest_common_counts_df[youngest_common_counts_df.PStaxID.isin(
194-
# [int(nsplit[1])])].counts)[0])
195-
og_dict = {}
179+
oc_og_dict = {}
196180
continuity_dict = {}
197-
for og in ogs_grouped_qt.index:
198-
og_hits = np.sort(
199-
list(set(list(ogs_grouped_qt[ogs_grouped_qt.index.isin([og])]['species'].to_dict().values())[0])))
200-
# get list of the youngest common between query and all other species
201-
og_hits_youngest_common = list(species_list_df.youngest_common[
202-
[x for x, y in enumerate(species_list_df.taxID)
203-
if y in og_hits]])
204-
# evaluate all youngest common nodes to retain the oldest of them and assign as the orthogroup
205-
# ancestral state (gene age)
206-
if len(og_hits_youngest_common) > 0:
207-
og_oldest_common = qlin.get_oldest_common(qlineage,
208-
og_hits_youngest_common)
209-
og_dict[og] = og_oldest_common
210-
if continuity:
211-
continuity_dict[og] = \
212-
of2orthomap.get_youngest_common_counts(qlineage,
213-
pd.DataFrame(og_hits_youngest_common,
214-
columns=['youngest_common'])).counts
181+
if os.path.basename(oc).split('.')[-1] == 'zip':
182+
oc_zip = zipfile.Path(oc, at='.'.join(os.path.basename(oc).split('.')[:-1]))
183+
oc_lines = oc_zip.open()
184+
else:
185+
oc_lines = open(oc,
186+
'r')
187+
oc_species = next(oc_lines)
188+
if type(oc_species) == bytes:
189+
oc_species = oc_species.decode('utf-8').strip().split('\t')
190+
else:
191+
oc_species = oc_species.strip().split('\t')
192+
oc_qidx = [x for x, y in enumerate(oc_species) if y == seqname]
193+
if len(oc_qidx) == 0:
194+
print('\nError <-qname>: query species name not in Broccoli results, please check spelling\n'
195+
'e.g. <head -1 table_OGs_protein_counts.txt>')
196+
sys.exit()
197+
for oc_line in oc_lines:
198+
if type(oc_line) == bytes:
199+
oc_og = oc_line.decode('utf-8').strip().split('\t')
200+
else:
201+
oc_og = oc_line.strip().split('\t')
202+
if int(oc_og[oc_qidx[0]]) == 0:
203+
continue
204+
if int(oc_og[oc_qidx[0]]) > 0:
205+
oc_og_hits = [oc_species[x+1] for x, y in enumerate(oc_og[1::][::-1][1::][::-1]) if int(y) > 0]
206+
# get list of the youngest common between query and all other species
207+
oc_og_hits_youngest_common = list(species_list.youngest_common[
208+
[x for x, y in enumerate(species_list.species)
209+
if y in oc_og_hits]])
210+
# evaluate all youngest common nodes to retain the oldest of them and assign as the orthogroup
211+
# ancestral state (gene age)
212+
if len(oc_og_hits_youngest_common) > 0:
213+
oc_og_oldest_common = qlin.get_oldest_common(qlineage,
214+
oc_og_hits_youngest_common)
215+
oc_og_dict[oc_og[0]] = oc_og_oldest_common
216+
if continuity:
217+
continuity_dict[oc_og[0]] = of2orthomap.get_youngest_common_counts(
218+
qlineage,
219+
pd.DataFrame(oc_og_hits_youngest_common,
220+
columns=['youngest_common'])).counts
221+
oc_lines.close()
215222
if continuity:
216223
youngest_common_counts_df = youngest_common_counts_df.join(pd.DataFrame.from_dict(continuity_dict))
217224
omap = []
218225
if out:
219226
if os.path.exists(out) and not overwrite:
220227
print('\nError <-overwrite>: output file exists, please set to True if it should be overwritten\n')
221228
sys.exit()
222-
outhandle = open(out, 'w')
229+
outhandle = open(out,
230+
'w')
223231
if continuity:
224232
outhandle.write('seqID\tOrthogroup\tPSnum\tPStaxID\tPSname\tPScontinuity\n')
225233
else:
226234
outhandle.write('seqID\tOrthogroup\tPSnum\tPStaxID\tPSname\n')
227-
for og in ogs_grouped_qt.index:
228-
og_tmp = ogs_grouped_qt[ogs_grouped_qt.index.isin([og])]
229-
og_ps = qlineagenames[qlineagenames['PStaxID'] ==
230-
str(og_dict[og])].values.tolist()[0]
231-
og_ps_join = '\t'.join(og_ps)
232-
if continuity:
233-
og_continuity_score = of2orthomap.get_continuity_score(og,
234-
youngest_common_counts_df)
235+
if os.path.basename(og).split('.')[-1] == 'zip':
236+
og_zip = zipfile.Path(og,
237+
at='.'.join(os.path.basename(og).split('.')[:-1]))
238+
og_lines = og_zip.open()
239+
else:
240+
og_lines = open(og,
241+
'r')
242+
og_species = next(og_lines)
243+
if type(og_species) == bytes:
244+
og_species = og_species.decode('utf-8').strip().split('\t')
245+
else:
246+
og_species = og_species.strip().split('\t')
247+
og_qidx = [x for x, y in enumerate(og_species) if y == seqname]
248+
if len(oc_qidx) == 0:
249+
print('\nError <-qname>: query species name not in Broccoli results, please check spelling\n'
250+
'e.g. <head -1 table_OGs_protein_counts.txt>')
251+
sys.exit()
252+
for og_line in og_lines:
253+
if type(og_line) == bytes:
254+
og_og = og_line.decode('utf-8').strip().split('\t')
255+
else:
256+
og_og = og_line.strip().split('\t')
257+
if og_og[0] not in oc_og_dict:
258+
continue
259+
else:
260+
og_ps = qlineagenames[qlineagenames['PStaxID'] ==
261+
str(oc_og_dict[og_og[0]])].values.tolist()[0]
262+
og_ps_join = '\t'.join(og_ps)
263+
if continuity:
264+
og_continuity_score = of2orthomap.get_continuity_score(og_name=og_og[0],
265+
youngest_common_counts_df=youngest_common_counts_df)
235266
if out:
236267
if continuity:
237-
[outhandle.write(x.replace(' ', '') + '\t' + og + '\t' + og_ps_join + '\t' +
238-
str(og_continuity_score) + '\n') for x in list(og_tmp['gene_id'])[0]]
268+
[outhandle.write(x.replace(' ', '') + '\t' + og_og[0] + '\t' + og_ps_join + '\t' +
269+
str(og_continuity_score) + '\n') for x in og_og[og_qidx[0]].split(',')]
239270
else:
240-
[outhandle.write(x.replace(' ', '') + '\t' + og + '\t' + og_ps_join + '\n')
241-
for x in list(og_tmp['gene_id'])[0]]
271+
[outhandle.write(x.replace(' ', '') + '\t' + og_og[0] + '\t' + og_ps_join + '\n')
272+
for x in og_og[og_qidx[0]].split(',')]
242273
if continuity:
243-
omap += [[x.replace(' ', ''), og, og_ps[0], og_ps[1], og_ps[2], og_continuity_score]
244-
for x in list(og_tmp['gene_id'])[0]]
274+
omap += [[x.replace(' ', ''), og_og[0], og_ps[0], og_ps[1], og_ps[2], og_continuity_score]
275+
for x in og_og[og_qidx[0]].split(',')]
245276
else:
246-
omap += [[x.replace(' ', ''), og, og_ps[0], og_ps[1], og_ps[2]]
247-
for x in list(og_tmp['gene_id'])[0]]
277+
omap += [[x.replace(' ', ''), og_og[0], og_ps[0], og_ps[1], og_ps[2]]
278+
for x in og_og[og_qidx[0]].split(',')]
279+
og_lines.close()
248280
if out:
249281
outhandle.close()
250282
omap_df = pd.DataFrame(omap)
@@ -263,7 +295,7 @@ def get_broccoli_orthomap(seqname,
263295
'PSname']
264296
omap_df['PSnum'] = [int(x) for x in list(omap_df['PSnum'])]
265297
return [omap_df,
266-
species_list_df,
298+
species_list,
267299
youngest_common_counts_df]
268300

269301

@@ -277,26 +309,36 @@ def main():
277309
if not args.dbname:
278310
print('\nError <-dbname>: Please specify taxadb.sqlite file')
279311
sys.exit()
312+
if not args.seqname:
313+
parser.print_help()
314+
print('\nError <-seqname>: Please specify query species name in Broccoli and taxID')
315+
sys.exit()
280316
if not args.qt:
281317
parser.print_help()
282318
print('\nError <-qt>: Please specify query species taxID')
283319
sys.exit()
284320
if not args.sl:
285321
parser.print_help()
286-
print('\nError <-sl>: Please specify PLAZA species information file <species_information.csv>')
322+
print('\nError <-sl>: Please specify species list as <Broccoli name><tab><species taxID>')
323+
sys.exit()
324+
if not args.oc:
325+
parser.print_help()
326+
print('\nError <-oc>: Please specify Broccoli <table_OGs_protein_counts.txt> (see dir_step3 directory)')
287327
sys.exit()
288328
if not args.og:
289329
parser.print_help()
290-
print('\nError <-og>: Please specify PLAZA gene family file <genefamily_data.ORTHOFAM.csv> or '
291-
'<genefamily_data.HOMFAM.csv>')
330+
print('\nError <-og>: Please specify Broccoli <table_OGs_protein_names.txt> (see dir_step3 directory)')
292331
sys.exit()
293-
get_plaza_orthomap(seqname=args.seqname,
294-
qt=args.qt,
295-
sl=args.sl,
296-
og=args.og,
297-
out=args.out,
298-
overwrite=args.overwrite,
299-
dbname=args.dbname)
332+
get_broccoli_orthomap(seqname=args.seqname,
333+
qt=args.qt,
334+
sl=args.sl,
335+
oc=args.oc,
336+
og=args.og,
337+
out=args.out,
338+
quiet=False,
339+
continuity=True,
340+
overwrite=args.overwrite,
341+
dbname=args.dbname)
300342

301343

302344
if __name__ == '__main__':

src/oggmap/of2orthomap.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def get_orthomap(seqname,
198198
oc_species = oc_species.strip().split('\t')
199199
oc_qidx = [x for x, y in enumerate(oc_species) if y == seqname]
200200
if len(oc_qidx) == 0:
201-
print('\nError <-qname>: query species name not in orthofinder results, please check spelling\n'
201+
print('\nError <-qname>: query species name not in OrthoFinder results, please check spelling\n'
202202
'e.g. <head -1 Orthogroups.GeneCounts.tsv>')
203203
sys.exit()
204204
for oc_line in oc_lines:
@@ -253,7 +253,7 @@ def get_orthomap(seqname,
253253
og_species = og_species.strip().split('\t')
254254
og_qidx = [x for x, y in enumerate(og_species) if y == seqname]
255255
if len(oc_qidx) == 0:
256-
print('\nError <-qname>: query species name not in orthofinder results, please check spelling\n'
256+
print('\nError <-qname>: query species name not in OrthoFinder results, please check spelling\n'
257257
'e.g. <head -1 Orthogroups.tsv>')
258258
sys.exit()
259259
for og_line in og_lines:

0 commit comments

Comments
 (0)