forked from nextgenusfs/funannotate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunannotate.py
executable file
·936 lines (807 loc) · 41.4 KB
/
funannotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
#!/usr/bin/env python
#Wrapper script for Funannotate package.
import sys, os, subprocess, inspect
from natsort import natsorted
# Directory containing this file; the 'bin' and 'util' helper scripts are
# located relative to it with os.path.join(script_path, ...).
script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
import lib.library as lib
def flatten(l):
    """Flatten one level of list nesting.

    Every element of ``l`` that is a list is replaced by its own elements;
    any other element is copied through unchanged. Only a single level is
    expanded, so lists nested more deeply stay lists in the result.

    Args:
        l: iterable whose items may or may not be lists.

    Returns:
        A new list; ``l`` itself is not modified.
    """
    flatList = []
    for elem in l:
        # isinstance rather than an exact type comparison, so that list
        # subclasses are expanded as well
        if isinstance(elem, list):
            flatList.extend(elem)
        else:
            flatList.append(elem)
    return flatList
def fmtcols(mylist, cols):
    """Lay out a flat list of strings as a left-justified table.

    Items fill the table row by row: row ``r`` holds
    ``mylist[r*cols:(r+1)*cols]``. Each column is padded to the length of the
    longest item that lands in it plus two, and the cells of a row are joined
    with a single space. A final, shorter row is emitted when ``len(mylist)``
    is not a multiple of ``cols``.

    Args:
        mylist: list of strings to print.
        cols: number of columns; must be at least 1.

    Returns:
        The table as one string with rows separated by newlines, or an empty
        string when ``mylist`` is empty.

    Raises:
        ValueError: if ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError('cols must be a positive integer')
    if not mylist:
        return ''
    # width of column c = longest item in mylist[c::cols] + 2 of padding;
    # "or [0]" covers columns that receive no item at all
    widths = []
    for c in range(cols):
        column = mylist[c::cols]
        widths.append(max([len(x) for x in column] or [0]) + 2)
    lines = []
    for start in range(0, len(mylist), cols):
        row = mylist[start:start + cols]
        cells = [item.ljust(widths[c]) for c, item in enumerate(row)]
        lines.append(' '.join(cells))
    return "\n".join(lines)
# Location of the funannotate database. Resolves to None when
# $FUNANNOTATE_DB is not exported, so that later checks such as
# ``if not FUNDB`` can report a missing database instead of failing with a
# NameError on an unbound name.
FUNDB = os.environ.get("FUNANNOTATE_DB")
# Version string interpolated into every help screen: the release number,
# suffixed with '-<git revision>' whenever lib.git_version() returns a
# non-empty value.
base_version = '1.6.0'
git_version = lib.git_version()
version = base_version + '-' + git_version if git_version else base_version
# Top-level usage screen listing every sub-command; the single %s placeholder
# is filled with the version string.
default_help = """
Usage: funannotate <command> <arguments>
version: %s
Description: Funannotate is a genome prediction, annotation, and comparison pipeline.
Command: clean Find/remove small repetitive contigs
sort Sort by size and rename contig headers
mask Repeatmask genome assembly
train RNA-seq mediated training of Augustus/GeneMark
predict Run gene prediction pipeline
fix Fix annotation errors (generate new GenBank file)
update RNA-seq/PASA mediated gene model refinement
remote Partial functional annotation using remote servers
iprscan InterProScan5 search (Docker or local)
annotate Assign functional annotation to gene predictions
compare Compare funannotated genomes
util Format conversion and misc utilities
setup Setup/Install databases
test Download/Run funannotate installation tests
check Check Python, Perl, and External dependencies
species list pre-trained Augustus species
database Manage databases
outgroups Manage outgroups for funannotate compare
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % version
# A sub-command was given: sys.argv[1] selects which branch below handles it.
if len(sys.argv) > 1:
# 'clean': forwards the remaining arguments to bin/funannotate-contig_cleaner.py.
if sys.argv[1] == 'clean':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: The script sorts contigs by size, starting with shortest contigs it uses minimap2 or
Mummer to find contigs duplicated elsewhere, and then removes duplicated contigs.
Arguments: -i, --input Multi-fasta genome file (Required)
-o, --out Cleaned multi-fasta output file (Required)
-p, --pident Percent identity of overlap. Default = 95
-c, --cov Percent coverage of overlap. Default = 95
-m, --minlen Minimum length of contig to keep. Default = 500
--exhaustive Test every contig. Default is to stop at N50 value.
--method Program to use to calculate overlaps. Default: minimap2 [minimap2,mummer]
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-contig_cleaner.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here, so a
# failing script does not change this wrapper's exit status
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'sort': forwards the remaining arguments to bin/funannotate-sort_rename.py.
elif sys.argv[1] == 'sort':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: This script sorts the input contigs by size (longest->shortest) and then relabels
the contigs with a simple name (e.g. scaffold_1). Augustus can have problems with
some complicated contig names.
Arguments: -i, --input Multi-fasta genome file. (Required)
-o, --output Sorted by size and relabeled output file. (Required)
-b, --base Base name to relabel contigs. Default: scaffold
--minlen Shorter contigs are discarded. Default: 0
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-sort_rename.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'mask': forwards the remaining arguments to bin/funannotate-mask.py.
elif sys.argv[1] == 'mask':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: This script is a wrapper for RepeatMasker and RepeatModeler. It will generate
a softmasked genome. Dependencies are RepeatMasker and RepeatModeler.
Arguments: -i, --input Multi-FASTA genome file. (Required)
-o, --out Output softmasked FASTA file. (Required)
-s, --repeatmasker_species Species to use for RepeatMasker
-l, --repeatmodeler_lib Custom repeat database (FASTA format)
--cpus Number of cpus to use. Default: 2
--debug Keep intermediate files
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-mask.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'train': forwards the remaining arguments to bin/funannotate-train.py.
elif sys.argv[1] == 'train':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script is a wrapper for Illumina RNA-seq data through genome-guided Trinity
followed by PASA. Long-read (nanopore/pacbio) RNA-seq is also supported.
Dependencies are hisat2, Trinity, samtools, fasta, minimap2, blat, PASA.
Required: -i, --input Genome multi-fasta file
-o, --out Output folder name
-l, --left Left/Forward FASTQ Illumina reads (R1)
-r, --right Right/Reverse FASTQ Illumina reads (R2)
-s, --single Single ended FASTQ reads
Optional: --stranded If RNA-seq library stranded. [RF,FR,F,R,no]
--left_norm Normalized left FASTQ reads (R1)
--right_norm Normalized right FASTQ reads (R2)
--single_norm Normalized single-ended FASTQ reads
--pacbio_isoseq PacBio long-reads
--nanopore_cdna Nanopore cDNA long-reads
--nanopore_mrna Nanopore mRNA direct long-reads
--trinity Pre-computed Trinity transcripts (FASTA)
--jaccard_clip Turn on jaccard clip for dense genomes [Recommended for fungi]
--no_normalize_reads Skip read Normalization
--no_trimmomatic Skip Quality Trimming of reads
--memory RAM to use for Jellyfish. Default: 50G
-c, --coverage Depth to normalize reads. Default: 50
-m, --min_coverage Min depth for normalizing reads. Default: 5
--pasa_db Database to use. Default: sqlite [mysql,sqlite]
--pasa_alignment_overlap PASA --stringent_alignment_overlap. Default: 30.0
--max_intronlen Maximum intron length. Default: 3000
--species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus"
--strain Strain name
--isolate Isolate name
--cpus Number of CPUs to use. Default: 2
ENV Vars: If not passed, will try to load from your $PATH.
--PASAHOME
--TRINITYHOME
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-train.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'predict': forwards the remaining arguments to bin/funannotate-predict.py.
elif sys.argv[1] == 'predict':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script takes genome multi-fasta file and a variety of inputs to do a comprehensive whole
genome gene prediction. Uses AUGUSTUS, GeneMark, BUSCO, BRAKER, EVidence Modeler,
tbl2asn, tRNAScan-SE, Exonerate, minimap2.
Required: -i, --input Genome multi-FASTA file (softmasked repeats)
-o, --out Output folder name
-s, --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus"
Optional: --isolate Isolate name, e.g. Af293
--strain Strain name, e.g. FGSCA4
--name Locus tag name (assigned by NCBI?). Default: FUN_
--numbering Specify where gene numbering starts. Default: 1
--maker_gff MAKER2 GFF file. Parse results directly to EVM.
--pasa_gff PASA generated gene models. filename:weight
--other_gff Annotation pass-through to EVM. filename:weight
--rna_bam RNA-seq mapped to genome to train Augustus/GeneMark-ET
--stringtie StringTie GTF result
-w, --weights Ab-initio predictor and EVM weight. Example: augustus:2 or pasa:10
--augustus_species Augustus species config. Default: uses species name
--min_training_models Minimum number of models to train Augustus. Default: 200
--genemark_mode GeneMark mode. Default: ES [ES,ET]
--genemark_mod GeneMark ini mod file
--busco_seed_species Augustus pre-trained species to start BUSCO. Default: anidulans
--optimize_augustus Run 'optimze_augustus.pl' to refine training (long runtime)
--busco_db BUSCO models. Default: dikarya. `funannotate outgroups --show_buscos`
--organism Fungal-specific options. Default: fungus. [fungus,other]
--ploidy Ploidy of assembly. Default: 1
-t, --tbl2asn Assembly parameters for tbl2asn. Default: "-l paired-ends"
-d, --database Path to funannotate database. Default: $FUNANNOTATE_DB
--protein_evidence Proteins to map to genome (prot1.fa prot2.fa uniprot.fa). Default: uniprot.fa
--protein_alignments Pre-computed protein alignments in GFF3 format
--transcript_evidence mRNA/ESTs to align to genome (trans1.fa ests.fa trinity.fa). Default: none
--transcript_alignments Pre-computed transcript alignments in GFF3 format
--augustus_gff Pre-computed AUGUSTUS GFF3 results (must use --stopCodonExcludedFromCDS=False)
--genemark_gtf Pre-computed GeneMark GTF results
--min_intronlen Minimum intron length. Default: 10
--max_intronlen Maximum intron length. Default: 3000
--soft_mask Softmasked length threshold for GeneMark. Default: 2000
--min_protlen Minimum protein length. Default: 50
--repeats2evm Use repeats in EVM consensus model building
--repeat_filter Repetitive gene model filtering. Default: overlap blast [overlap,blast,none]
--keep_no_stops Keep gene models without valid stops
--SeqCenter Sequencing facilty for NCBI tbl file. Default: CFMR
--SeqAccession Sequence accession number for NCBI tbl file. Default: 12345
--force Annotated unmasked genome
--cpus Number of CPUs to use. Default: 2
ENV Vars: If not specified at runtime, will be loaded from your $PATH
--EVM_HOME
--AUGUSTUS_CONFIG_PATH
--GENEMARK_PATH
--BAMTOOLS_PATH
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-predict.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'update': forwards the remaining arguments to bin/funannotate-update.py.
elif sys.argv[1] == 'update':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script will run PASA mediated update of gene models. It can directly update
the annotation from an NCBI downloaded GenBank file using RNA-seq data or can be
used after funannotate predict to refine UTRs and gene model predictions. Kallisto
is used to evidence filter most likely PASA gene models. Dependencies are
hisat2, Trinity, samtools, fasta, minimap2, blat, PASA, kallisto, bedtools.
Required: -i, --input Funannotate folder or Genome in GenBank format (.gbk,.gbff).
or
-f, --fasta Genome in FASTA format
-g, --gff Annotation in GFF3 format
--species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus"
Optional: -o, --out Output folder name
-l, --left Left/Forward FASTQ Illumina reads (R1)
-r, --right Right/Reverse FASTQ Illumina reads (R2)
-s, --single Single ended FASTQ reads
--stranded If RNA-seq library stranded. [RF,FR,F,R,no]
--left_norm Normalized left FASTQ reads (R1)
--right_norm Normalized right FASTQ reads (R2)
--single_norm Normalized single-ended FASTQ reads
--pacbio_isoseq PacBio long-reads
--nanopore_cdna Nanopore cDNA long-reads
--nanopore_mrna Nanopore mRNA direct long-reads
--trinity Pre-computed Trinity transcripts (FASTA)
--jaccard_clip Turn on jaccard clip for dense genomes [Recommended for fungi]
--no_normalize_reads Skip read Normalization
--no_trimmomatic Skip Quality Trimming of reads
--memory RAM to use for Jellyfish. Default: 50G
-c, --coverage Depth to normalize reads. Default: 50
-m, --min_coverage Min depth for normalizing reads. Default: 5
--pasa_config PASA assembly config file, i.e. from previous PASA run
--pasa_db Database to use. Default: sqlite [mysql,sqlite]
--pasa_alignment_overlap PASA --stringent_alignment_overlap. Default: 30.0
--max_intronlen Maximum intron length. Default: 3000
--min_protlen Minimum protein length. Default: 50
--alt_transcripts Expression threshold (percent) to keep alt transcripts. Default: 0.1 [0-1]
--p2g NCBI p2g file (if updating NCBI annotation)
-t, --tbl2asn Assembly parameters for tbl2asn. Example: "-l paired-ends"
--name Locus tag name (assigned by NCBI?). Default: use existing
--sbt NCBI Submission file
--species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus"
--strain Strain name
--isolate Isolate name
--SeqCenter Sequencing facilty for NCBI tbl file. Default: CFMR
--SeqAccession Sequence accession number for NCBI tbl file. Default: 12345
--cpus Number of CPUs to use. Default: 2
ENV Vars: If not passed, will try to load from your $PATH.
--PASAHOME
--TRINITYHOME
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-update.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'annotate': forwards the remaining arguments to bin/funannotate-functional.py.
elif sys.argv[1] == 'annotate':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script functionally annotates the results from funannotate predict. It pulls
annotation from PFAM, InterPro, EggNog, UniProtKB, MEROPS, CAZyme, and GO ontology.
Required: -i, --input Folder from funannotate predict
or
--genbank Genome in GenBank format
-o, --out Output folder for results
or
--gff Genome GFF3 annotation file
--fasta Genome in multi-fasta format
-s, --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus"
-o, --out Output folder for results
Optional: --sbt NCBI submission template file. (Recommended)
-a, --annotations Custom annotations (3 column tsv file)
--eggnog Eggnog-mapper annotations file (if NOT installed)
--antismash antiSMASH secondary metabolism results (GBK file from output)
--iprscan InterProScan5 XML file
--phobius Phobius pre-computed results (if phobius NOT installed)
--isolate Isolate name
--strain Strain name
--rename Rename GFF gene models with locus_tag from NCBI.
--fix Gene/Product names fixed (TSV: GeneID\tName\tProduct)
--remove Gene/Product names to remove (TSV: Gene\tProduct)
--busco_db BUSCO models. Default: dikarya
-t, --tbl2asn Additional parameters for tbl2asn. Default: "-l paired-ends"
-d, --database Path to funannotate database. Default: $FUNANNOTATE_DB
--force Force over-write of output folder
--cpus Number of CPUs to use. Default: 2
ENV Vars: If not specified at runtime, will be loaded from your $PATH
--AUGUSTUS_CONFIG_PATH
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-functional.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'test': forwards the remaining arguments to bin/funannotate-test.py.
elif sys.argv[1] == 'test':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: This is a script that runs several unit tests. It will download data and run
several different tests to determine if installion is functioning properly. If
you cannot download from the machine funannotate is installed at - then download
the 7 tar.gz files from https://osf.io/bj7v4/files/ and run script from directory
Arguments: -t, --tests Test sets to run. [all,clean,mask,predict,busco,rna-seq,annotate,compare]
--cpus Number of cpus to use. Default: 2
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-test.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'remote': forwards the remaining arguments to bin/funannotate-remote.py.
elif sys.argv[1] == 'remote':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script runs remote server functional annotation for Phobius, InterProScan5, and
antiSMASH (fungi). These searches are slow, if you can setup these services locally
it will be much faster to do that. PLEASE do not abuse services!
Required: -i, --input Funannotate input folder.
or
-g, --genbank GenBank file (must be annotated).
-o, --out Output folder name.
and
-m, --methods Which services to run, space separated [phobius,antismash,interproscan,all]
-e, --email Email address to identify yourself to services.
Optional: --force Force query even if antiSMASH server looks busy
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-remote.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'compare': forwards the remaining arguments to bin/funannotate-compare.py.
elif sys.argv[1] == 'compare':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script does light-weight comparative genomics between funannotated genomes. Output
is graphs, phylogeny, CSV files, etc --> visualized in web-browser.
Required: -i, --input List of funannotate genome folders or GBK files
Optional: -o, --out Output folder name. Default: funannotate_compare
-d, --database Path to funannotate database. Default: $FUNANNOTATE_DB
--cpus Number of CPUs to use. Default: 2
--run_dnds Calculate dN/dS ratio on all orthologs. [estimate,full]
--go_fdr P-value for FDR GO-enrichment. Default: 0.05
--heatmap_stdev Cut-off for heatmap. Default: 1.0
--num_orthos Number of Single-copy orthologs to use for ML. Default: 500
--bootstrap Number of boostrap replicates to run with RAxML. Default: 100
--outgroup Name of species to use for ML outgroup. Default: no outgroup
--proteinortho ProteinOrtho5 POFF results.
--ml_method Maxmimum Liklihood method: Default: raxml [raxml,iqtree]
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-compare.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'fix': forwards the remaining arguments to util/updateGBK.py.
elif sys.argv[1] == 'fix':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script takes a GenBank genome annotation file and an NCBI tbl file to
generate updated annotation. Script is used to fix problematic gene models
after running funannotate predict.
Required: -i, --input Annotated genome in GenBank format.
-t, --tbl NCBI tbl annotation file.
-d, --drop Gene models to remove/drop from annotation. File with locus_tag 1 per line.
Optional: -o, --out Output folder
--tbl2asn Parameters for tbl2asn. Default: "-l paired-ends"
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'util', 'updateGBK.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'species': print the entries found in $AUGUSTUS_CONFIG_PATH/species in three columns.
elif sys.argv[1] == 'species':
try:
AUGUSTUS = os.environ["AUGUSTUS_CONFIG_PATH"]
except KeyError:
print("Error: Augustus is not properly configured. Please review installation instructions")
sys.exit(1)
#get the possible species from augustus
augustus_list = []
for i in os.listdir(os.path.join(AUGUSTUS, 'species')):
# skip hidden entries (names starting with a dot)
if not i.startswith('.'):
augustus_list.append(i)
# de-duplicate, then natural-sort the names before printing
augustus_list = set(augustus_list)
d = flatten(natsorted(augustus_list))
print('--------------------------')
print('AUGUSTUS species options:')
print('--------------------------')
print(lib.list_columns(d, cols=3))
# NOTE(review): exits with status 1 even after a successful listing — confirm this is intended
sys.exit(1)
# 'check': always runs util/check_modules.py, passing along any extra arguments.
elif sys.argv[1] == 'check':
arguments = sys.argv[2:]
cmd = os.path.join(script_path, 'util', 'check_modules.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
# NOTE(review): exits with status 1 regardless of the return code of the call above — confirm this is intended
sys.exit(1)
# 'setup': forwards the remaining arguments to bin/funannotate-setup.py.
elif sys.argv[1] == 'setup':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Script will download/format necessary databases for funannotate.
Options: -i, --install Download format databases. Default: all
[merops,uniprot,dbCAN,pfam,repeats,go,
mibig,interpro,busco_outgroups,gene2product]
-b, --busco_db Busco Databases to install. Default: dikarya [all,fungi,aves,etc]
-d, --database Path to funannotate database
-u, --update Check remote md5 and update if newer version found
-f, --force Force overwriting database
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'bin', 'funannotate-setup.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'iprscan': forwards the remaining arguments to util/funannotate-iprscan.py.
elif sys.argv[1] == 'iprscan':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: This script is a wrapper for running InterProScan5 using Docker or from a
local installation. The script splits proteins into smaller chunks and then
launches several interproscan.sh "processes". It then combines the results.
Note if you are on a large cluster, you probably don't want to use this script
as likely the "cluster" mode of InterProScan5 will be faster.
Arguments: -i, --input Funannotate folder or FASTA protein file. (Required)
-m, --method Search method to use: [local, docker] (Required)
-n, --num Number of fasta files per chunk. Default: 1000
-o, --out Output XML InterProScan5 file
Docker arguments:
-c, --cpus Number of CPUs (total). Default: 12
--cpus_per_chunk Number of cpus per Docker instance. Default: 4
Local arguments:
--iprscan_path Full path to interproscan.sh (Required)
-c, --cpus Number of InterProScan instances to run
(configure cpu/thread control in interproscan.properties file)
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
# with fewer than two extra arguments the help text is printed and the exit status is 1
if len(arguments) > 1:
cmd = os.path.join(script_path, 'util', 'funannotate-iprscan.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'database':
#setup funannotate DB path
try:
FUNDB = os.environ["FUNANNOTATE_DB"]
except KeyError:
print('$FUNANNOTATE_DB not found, run funannotate setup and export ENV variable')
sys.exit(1)
dbfile = os.path.join(FUNDB, 'funannotate-db-info.txt')
db_list = [['Database', 'Type', 'Version', 'Date', 'Num_Records', 'Md5checksum']]
if not os.path.isfile(dbfile):
print('Database is not properly configured, re-run funannotate setup')
sys.exit(1)
with open(dbfile, 'rU') as infile:
for line in infile:
line = line.rstrip()
cols = line.split('\t')
del cols[2]
db_list.append(cols)
msg=lib.bold_underline('Funannotate Databases currently installed:')
print('\n'+msg+'\n')
lib.print_table(db_list, alignments='LLLLRL', max_col_width=60)
print('\nTo update a database type:\n\tfunannotate setup -i DBNAME -d {:} --force\n'.format(FUNDB))
sys.exit(1)
# 'outgroups': list installed outgroups, show the BUSCO tree, or forward the
# remaining arguments to util/add2outgroups.py.
elif sys.argv[1] == 'outgroups':
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Managing the outgroups folder for funannotate compare
Arguments: -i, --input Proteome multi-fasta file. Required.
--species Species name for adding a species. Required.
--busco_db BUSCO db to use for --add. Default. dikarya
--cpus Number of CPUs to use for BUSCO search.
--show_buscos List the busco_db options
--show_outgroups List the installed outgroup species.
-d, --database Path to funannotate database. Default: $FUNANNOTATE_DB
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
if '--show_outgroups' in arguments:
# a database path given on the command line overrides the module-level FUNDB
# NOTE(review): index(...)+1 raises IndexError when -d/--database is the last argument
if '-d' in arguments:
FUNDB = arguments[arguments.index('-d')+1]
elif '--database' in arguments:
FUNDB = arguments[arguments.index('--database')+1]
# NOTE(review): relies on FUNDB already being bound at module level when
# neither -d nor --database was given — confirm it is always defined here
if not FUNDB:
print('Funannotate database not configured, set ENV variable or pass -d.')
sys.exit(1)
try:
files = [f for f in os.listdir(os.path.join(FUNDB, 'outgroups'))]
except OSError:
print('ERROR: %s/outgroups folder is not found, run funannotate setup.' % FUNDB)
sys.exit(1)
# strip the '_buscos.fa' suffix from each file name and drop hidden files
files = [ x.replace('_buscos.fa', '') for x in files ]
files = [ x for x in files if not x.startswith('.') ]
print("-----------------------------")
print("BUSCO Outgroups:")
print("-----------------------------")
print(lib.list_columns(files, cols=3))
print('')
# NOTE(review): exits with status 1 even after a successful listing — confirm this is intended
sys.exit(1)
elif '--show_buscos' in arguments:
print("-----------------------------")
print("BUSCO DB tree: (# of models)")
print("-----------------------------")
print(lib.buscoTree)
# NOTE(review): exits with status 1 even after a successful listing — confirm this is intended
sys.exit(1)
elif len(arguments) > 1:
cmd = os.path.join(script_path, 'util', 'add2outgroups.py')
arguments.insert(0, cmd)
# run the script with the interpreter that is executing this wrapper
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
# 'util': a second level of sub-commands. arguments[0] names the utility; each
# branch below sets `cmd` to the script to run, or prints help and exits.
elif sys.argv[1] == 'util':
help = """
Usage: funannotate %s <arguments>
version: %s
Commands: compare Compare annotations to reference (GFF3 or GBK annotations)
tbl2gbk Convert TBL format to GenBank format
gbk2parts Convert GBK file to individual components
gff2proteins Convert GFF3 + FASTA files to protein FASTA
gff2tbl Convert GFF3 format to NCBI annotation table (tbl)
bam2gff3 Convert BAM coord-sorted transcript alignments to GFF3
prot2genome Map proteins to genome generating GFF3 protein alignments
stringtie2gff3 Convert GTF (stringTIE) to GFF3 format
quarry2gff3 Convert CodingQuarry output to proper GFF3 format
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 0:
subcmd = arguments[0]
# 'util gff2tbl': selects util/gff2tbl.py.
if subcmd == 'gff2tbl':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert GFF3 file into NCBI tbl format. Tbl output to stdout.
Arguments: -g, --gff3 Reference Annotation. GFF3 format
-f, --fasta Genome FASTA file.
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'gff2tbl.py')
else:
print(help)
sys.exit(1)
# 'util prot2genome': selects bin/funannotate-p2g.py.
elif subcmd == 'prot2genome':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Map proteins to genome using exonerate. Output is EVM compatible GFF3 file.
Arguments: -g, --genome Genome FASTA format (Required)
-p, --proteins Proteins FASTA format (Required)
-o, --out GFF3 output file (Required)
-f, --filter Pre-filtering method. Default: diamond [diamond,tblastn]
-t, --tblastn_out Output to save tblastn results. Default: off
--tblastn Use existing tblastn results
--ploidy Ploidy of assembly. Default: 1
--maxintron Max intron length. Default: 3000
--cpus Number of cpus to use. Default: 2
--EVM_HOME Location of Evidence Modeler home directory. Default: $EVM_HOME
--logfile Logfile output file
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'bin', 'funannotate-p2g.py')
else:
print(help)
sys.exit(1)
# 'util gff2proteins': selects util/gff2prot.py.
elif subcmd == 'gff2proteins':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
# NOTE(review): the description below says "NCBI tbl format" although the
# main util help describes this utility as producing protein FASTA — looks
# copied from gff2tbl; confirm and correct the wording
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert GFF3 file into NCBI tbl format. FASTA output to stdout.
Arguments: -g, --gff3 Reference Annotation. GFF3 format
-f, --fasta Genome FASTA file.
--no_stop Dont print stop codons
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'gff2prot.py')
else:
print(help)
sys.exit(1)
# 'util gbk2parts': selects util/gbk2parts.py.
elif subcmd == 'gbk2parts':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert GenBank file to its individual components (parts) tbl, protein
FASTA, transcript FASTA, and contig/scaffold FASTA.
Arguments: -g, --gbk Input Genome in GenBank format
-o, --output Output basename
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'gbk2parts.py')
else:
print(help)
sys.exit(1)
# 'util compare': selects util/compare2annotations.py.
elif subcmd == 'compare':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Compare annotations to reference. Annotations in either GBK or GFF3 format.
Arguments: -r, --reference Reference Annotation. GFF3 or GBK format
-f, --fasta Genome FASTA. Required if GFF3 used
-q, --query Annotation query. GFF3 or GBK format
-o, --output Output basename
-c, --calculate_pident Measure protein percent identity between query and reference
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'compare2annotations.py')
else:
print(help)
sys.exit(1)
# 'util tbl2gbk': selects util/tbl2gbk.py.
elif subcmd == 'tbl2gbk':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert NCBI TBL annotations + Genome FASTA to GenBank format.
Required: -i, --tbl Annotation in NCBI tbl format
-f, --fasta Genome FASTA file.
-s, --species Species name, use quotes for binomial, e.g. "Aspergillus fumigatus"
Optional:
--isolate Isolate name
--strain Strain name
--sbt NCBI Submission Template file
-t, --tbl2asn Assembly parameters for tbl2asn. Example: "-l paired-ends"
-o, --output Output basename
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'tbl2gbk.py')
else:
print(help)
sys.exit(1)
# 'util bam2gff3': selects util/bam2gff3.py.
elif subcmd == 'bam2gff3':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert BAM coordsorted transcript alignments to GFF3 format.
Arguments: -i, --bam BAM file (coord-sorted)
-o, --output GFF3 output file
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'bam2gff3.py')
else:
print(help)
sys.exit(1)
# 'util stringtie2gff3': selects util/stringtie2gff.py.
elif subcmd == 'stringtie2gff3':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert StringTIE GTF format to GFF3 funannotate compatible format. Output
to stdout.
Arguments: -i, --input GTF file from stringTIE
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'stringtie2gff.py')
else:
print(help)
sys.exit(1)
# 'util quarry2gff3': selects util/codingquarry2gff3.py.
elif subcmd == 'quarry2gff3':
# sys.argv[1] is 'util' in this branch, so the usage line names 'util', not the utility
help = """
Usage: funannotate %s <arguments>
version: %s
Description: Convert CodingQuarry output GFF to proper GFF3 format. Output to stdout.
Arguments: -i, --input CodingQuarry output GFF file. (PredictedPass.gff3)
Written by Jon Palmer (2016-2018) nextgenusfs@gmail.com
""" % (sys.argv[1], version)
# drop the utility name; at least one further argument is needed to run
arguments = arguments[1:]
if len(arguments) > 0:
cmd = os.path.join(script_path, 'util', 'codingquarry2gff3.py')
else:
print(help)
sys.exit(1)
# unrecognized utility name: `help` still holds the general util help here
else:
print(help)
sys.exit(1)
# a utility branch above selected `cmd`: run it with the interpreter that is
# executing this wrapper
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
# NOTE(review): the return code of subprocess.call is discarded here
subprocess.call(arguments)
# 'util' given without a utility name: show the general util help
else:
print(help)
sys.exit(1)
# 'version': print the version string.
elif sys.argv[1] == 'version':
print("funannotate v%s" % version)
# any other word is an unknown sub-command: report it, show the main help and
# exit with status 1
else:
print("%s option not recognized" % sys.argv[1])
print(default_help)
sys.exit(1)
# no sub-command at all: show the main help
else:
print(default_help)