From b6f0960cf89778d66d38668a044f962c6afef089 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 15:04:19 +0100 Subject: [PATCH 01/15] Add draft for turning off loading of citations. --- mpwt/__init__.py | 2 +- mpwt/__main__.py | 5 ++++- mpwt/mpwt_workflow.py | 15 ++++++++++++++- mpwt/utils.py | 16 +++++++++++++++- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/mpwt/__init__.py b/mpwt/__init__.py index 151de1e..08e2354 100755 --- a/mpwt/__init__.py +++ b/mpwt/__init__.py @@ -1,3 +1,3 @@ from mpwt.pwt_wrapper import run_pwt, run_pwt_dat from mpwt.mpwt_workflow import multiprocess_pwt -from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, remove_pgdbs +from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs diff --git a/mpwt/__main__.py b/mpwt/__main__.py index c35fc3a..f819fa7 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -7,7 +7,7 @@ The script takes a folder name as argument. usage: - mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--dat] [--md] [--cpu=INT] [-r] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] + mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--dat] [--md] [--cpu=INT] [-r] [--nc] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] mpwt --dat [-f=DIR] [-o=DIR] [--md] [--cpu=INT] [-v] mpwt -o=DIR [--md] [--cpu=INT] [-v] mpwt --clean [--cpu=INT] [-v] @@ -21,6 +21,7 @@ -o=DIR Output folder path. Will create a output folder in this folder. --patho Will run an inference of Pathologic on the input files. --hf Use with --patho. Run the Hole Filler using Blast. + --nc Turn off loading of Pubmed entries. --dat Will create BioPAX/attribute-value dat files from PGDB. --md Move only the dat files into the output folder. --clean Clean ptools-local folder, before any other operations. @@ -69,6 +70,7 @@ def run_mpwt(): pgdb_list = args['--list'] ignore_error = args['--ignore-error'] taxon_file = args['--taxon-file'] + turn_off_citations = args['--nc'] verbose = args['-v'] topf = args['topf'] @@ -118,6 +120,7 @@ def run_mpwt(): patho_log=patho_log, ignore_error=ignore_error, taxon_file=taxon_file, + turn_off_citations=turn_off_citations, verbose=verbose) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index 10616f1..cdec7a3 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -25,7 +25,8 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None, patho_hole_filler=None, dat_creation=None, dat_extraction=None, size_reduction=None, number_cpu=None, patho_log=None, - ignore_error=None, taxon_file=None, verbose=None): + ignore_error=None, taxon_file=None, turn_off_citations=None, + verbose=None): """ Function managing all the workflow (from the creatin of the input files to the results). Use it when you import mpwt in a script. @@ -72,6 +73,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if taxon_file and not patho_inference: sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.') + #Check if turn_off_citations is used with patho_inference. + if turn_off_citations and not patho_inference: + sys.exit('To use --nc/turn_off_citations, you need to use the --patho/patho_inference argument.') + # Use the number of cpu given by the user or 1 CPU. 
if number_cpu: try: @@ -82,6 +87,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None number_cpu_to_use = 1 mpwt_pool = Pool(processes=number_cpu_to_use) + # Turn off loading of pubmed entries. + if turn_off_citations: + utils.pubmed_citations(activate_citations=False) + # Check input folder and create input files for PathoLogic. if input_folder: run_ids = [folder_id for folder_id in next(os.walk(input_folder))[1]] @@ -198,6 +207,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None mpwt_pool.close() mpwt_pool.join() + # Turn on loading of pubmed entries. + if turn_off_citations: + utils.pubmed_citations(activate_citations=True) + end_time = time.time() times.append(end_time) steps.append('mpwt') diff --git a/mpwt/utils.py b/mpwt/utils.py index 03ca659..b373e58 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -330,4 +330,18 @@ def run_create_pathologic_file(multiprocessing_input_data): element_file.write('//\n\n') - \ No newline at end of file + +def pubmed_citations(activate_citations): + ptools_init_filepath = find_ptools_path() + '/ptools-init.dat' + new_ptools_file = "" + with open(ptools_init_filepath, 'r') as ptools_init_file: + for line in ptools_init_file.read().split('\n'): + if '##download-pubmed-citations' in line: + if activate_citations: + line = line.replace('N', 'Y') + elif activate_citations == False: + line = line.replace('Y', 'N') + new_ptools_file = new_ptools_file + line + '\n' + + with open(ptools_init_filepath, 'w') as ptools_init_file: + ptools_init_file.write(new_ptools_file) From da8a77478b516878c4f5784f5eb4cd01a7aaf315 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 17:29:26 +0100 Subject: [PATCH 02/15] Fix issue with variable name in topf function (issue #30). --- mpwt/utils.py | 111 ++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/mpwt/utils.py b/mpwt/utils.py index b373e58..d42ae70 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -216,7 +216,7 @@ def run_create_pathologic_file(multiprocessing_input_data): if 'taxon:' in src_dbxref_qualifier: taxon_id = src_dbxref_qualifier.replace('taxon:', '') except KeyError: - logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(genbank_folder)) + logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. 
You can find it on the NCBI.'.format(input_path)) if taxon_id: if not os.path.exists(output_folder + '/taxon_id.tsv'): with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: @@ -228,59 +228,59 @@ def run_create_pathologic_file(multiprocessing_input_data): taxon_writer = csv.writer(taxon_id_file, delimiter='\t') taxon_writer.writerow([input_name, taxon_id]) - for record in SeqIO.parse(input_path, 'genbank'): - element_id = record.id - records = [record] - SeqIO.write(records, output_path + '/' + element_id + '.fasta', 'fasta') - with open(output_path + '/' + element_id + '.pf', 'w') as element_file: - element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') - element_file.write(';; ' + element_id + '\n') - element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') - for feature in record.features: - if feature.type == 'CDS': - gene_name = None - gene_id = None - if 'locus_tag' in feature.qualifiers: - gene_id = feature.qualifiers['locus_tag'][0] - if 'gene' in feature.qualifiers: - gene_name = feature.qualifiers['gene'][0] - if not gene_id and not gene_name: - logger.critical('No locus_tag and no gene qualifiers in feature of record: ' + record.id) - pass - if gene_id: - element_file.write('ID\t' + gene_id + '\n') - else: - if gene_name: - element_file.write('ID\t' + gene_name + '\n') + for record in SeqIO.parse(input_path, 'genbank'): + element_id = record.id + records = [record] + SeqIO.write(records, output_path + '/' + element_id + '.fasta', 'fasta') + with open(output_path + '/' + element_id + '.pf', 'w') as element_file: + element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') + element_file.write(';; ' + element_id + '\n') + element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') + for feature in record.features: + if feature.type == 'CDS': + gene_name = None + gene_id = None + if 'locus_tag' in feature.qualifiers: + gene_id = feature.qualifiers['locus_tag'][0] + if 'gene' in feature.qualifiers: + gene_name = feature.qualifiers['gene'][0] + if not gene_id and not gene_name: + logger.critical('No locus_tag and no gene qualifiers in feature of record: ' + record.id) + pass + if gene_id: + element_file.write('ID\t' + gene_id + '\n') + else: if gene_name: - element_file.write('NAME\t' + gene_name + '\n') - else: - if gene_id: - element_file.write('NAME\t' + gene_id + '\n') - element_file.write('STARTBASE\t' + str(feature.location.start+1) + '\n') - element_file.write('ENDBASE\t' + str(feature.location.end) + '\n') - if 'function' in feature.qualifiers: - for function in feature.qualifiers['function']: - element_file.write('FUNCTION\t' + function + '\n') - if 'EC_number' in feature.qualifiers: - for ec in feature.qualifiers['EC_number']: - element_file.write('EC\t' + ec + '\n') - if 'go_component' in feature.qualifiers: - for go in feature.qualifiers['go_component']: - element_file.write('GO\t' + go + '\n') - if 'go_function' in feature.qualifiers: - for go in feature.qualifiers['go_component']: - element_file.write('GO\t' + go + '\n') - if 'go_process' in feature.qualifiers: - for go in feature.qualifiers['go_component']: - element_file.write('GO\t' + go + '\n') - element_file.write('PRODUCT-TYPE\tP' + '\n') + element_file.write('ID\t' + gene_name + '\n') + if gene_name: + element_file.write('NAME\t' + gene_name + '\n') + else: if gene_id: - element_file.write('PRODUCT-ID\tprot ' + gene_id + '\n') - else: - if gene_name: - element_file.write('PRODUCT-ID\tprot ' + gene_name + '\n') - element_file.write('//\n\n') + element_file.write('NAME\t' + gene_id + '\n') + element_file.write('STARTBASE\t' + 
str(feature.location.start+1) + '\n') + element_file.write('ENDBASE\t' + str(feature.location.end) + '\n') + if 'function' in feature.qualifiers: + for function in feature.qualifiers['function']: + element_file.write('FUNCTION\t' + function + '\n') + if 'EC_number' in feature.qualifiers: + for ec in feature.qualifiers['EC_number']: + element_file.write('EC\t' + ec + '\n') + if 'go_component' in feature.qualifiers: + for go in feature.qualifiers['go_component']: + element_file.write('GO\t' + go + '\n') + if 'go_function' in feature.qualifiers: + for go in feature.qualifiers['go_component']: + element_file.write('GO\t' + go + '\n') + if 'go_process' in feature.qualifiers: + for go in feature.qualifiers['go_component']: + element_file.write('GO\t' + go + '\n') + element_file.write('PRODUCT-TYPE\tP' + '\n') + if gene_id: + element_file.write('PRODUCT-ID\tprot ' + gene_id + '\n') + else: + if gene_name: + element_file.write('PRODUCT-ID\tprot ' + gene_name + '\n') + element_file.write('//\n\n') elif input_path.endswith('.gff'): gff_database = gffutils.create_db(input_path, ':memory:', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True) @@ -339,9 +339,12 @@ def pubmed_citations(activate_citations): if '##download-pubmed-citations' in line: if activate_citations: line = line.replace('N', 'Y') - elif activate_citations == False: + else: line = line.replace('Y', 'N') - new_ptools_file = new_ptools_file + line + '\n' + if line != '': + new_ptools_file = new_ptools_file + line + '\n' + else: + new_ptools_file = new_ptools_file + line with open(ptools_init_filepath, 'w') as ptools_init_file: ptools_init_file.write(new_ptools_file) From 23b0815f4c8659f72e69892dd3a7c629ce5abf6e Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 17:31:17 +0100 Subject: [PATCH 03/15] Add option for operon prediction (issue #33). --- mpwt/__main__.py | 11 +++++++---- mpwt/mpwt_workflow.py | 37 ++++++++++++++++++++++--------------- mpwt/pathologic_input.py | 7 +++++-- mpwt/pwt_wrapper.py | 4 ++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/mpwt/__main__.py b/mpwt/__main__.py index f819fa7..6ef12aa 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -7,7 +7,7 @@ The script takes a folder name as argument. usage: - mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--dat] [--md] [--cpu=INT] [-r] [--nc] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] + mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--op] [--nc] [--dat] [--md] [--cpu=INT] [-r] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] mpwt --dat [-f=DIR] [-o=DIR] [--md] [--cpu=INT] [-v] mpwt -o=DIR [--md] [--cpu=INT] [-v] mpwt --clean [--cpu=INT] [-v] @@ -21,7 +21,8 @@ -o=DIR Output folder path. Will create a output folder in this folder. --patho Will run an inference of Pathologic on the input files. --hf Use with --patho. Run the Hole Filler using Blast. - --nc Turn off loading of Pubmed entries. + --op Use with --patho. Run the Operon predictor of Pathway-Tools. + --nc Use with --patho. Turn off loading of Pubmed entries. --dat Will create BioPAX/attribute-value dat files from PGDB. --md Move only the dat files into the output folder. --clean Clean ptools-local folder, before any other operations. 
@@ -61,6 +62,8 @@ def run_mpwt(): output_folder = args['-o'] patho_inference = args['--patho'] patho_hole_filler = args['--hf'] + patho_operon_predictor = args['--op'] + patho_citations = args['--nc'] dat_creation = args['--dat'] move_dat = args['--md'] size_reduction = args['-r'] @@ -70,7 +73,6 @@ def run_mpwt(): pgdb_list = args['--list'] ignore_error = args['--ignore-error'] taxon_file = args['--taxon-file'] - turn_off_citations = args['--nc'] verbose = args['-v'] topf = args['topf'] @@ -113,6 +115,8 @@ def run_mpwt(): output_folder=output_folder, patho_inference=patho_inference, patho_hole_filler=patho_hole_filler, + patho_operon_predictor=patho_operon_predictor, + patho_citations=patho_citations, dat_creation=dat_creation, dat_extraction=move_dat, size_reduction=size_reduction, @@ -120,7 +124,6 @@ def run_mpwt(): patho_log=patho_log, ignore_error=ignore_error, taxon_file=taxon_file, - turn_off_citations=turn_off_citations, verbose=verbose) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index cdec7a3..d5a6996 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -23,10 +23,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None, - patho_hole_filler=None, dat_creation=None, dat_extraction=None, - size_reduction=None, number_cpu=None, patho_log=None, - ignore_error=None, taxon_file=None, turn_off_citations=None, - verbose=None): + patho_hole_filler=None, patho_operon_predictor=None, patho_citations=None, + dat_creation=None, dat_extraction=None, size_reduction=None, + number_cpu=None, patho_log=None, ignore_error=None, + taxon_file=None, turn_off_citations=None, verbose=None): """ Function managing all the workflow (from the creatin of the input files to the results). Use it when you import mpwt in a script. @@ -73,9 +73,13 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if taxon_file and not patho_inference: sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.') - #Check if turn_off_citations is used with patho_inference. - if turn_off_citations and not patho_inference: - sys.exit('To use --nc/turn_off_citations, you need to use the --patho/patho_inference argument.') + #Check if patho_operon_predictor is used with patho_inference. + if patho_operon_predictor and not patho_inference: + sys.exit('To use --op/patho_operon_predictor, you need to use the --patho/patho_inference argument.') + + #Check if patho_citations is used with patho_inference. + if patho_citations and not patho_inference: + sys.exit('To use --nc/patho_citations, you need to use the --patho/patho_inference argument.') # Use the number of cpu given by the user or 1 CPU. if number_cpu: @@ -88,7 +92,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None mpwt_pool = Pool(processes=number_cpu_to_use) # Turn off loading of pubmed entries. - if turn_off_citations: + if patho_citations: utils.pubmed_citations(activate_citations=False) # Check input folder and create input files for PathoLogic. @@ -104,8 +108,9 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if run_patho_dat_ids: # Create the list containing all the data used by the multiprocessing call. 
multiprocess_inputs = create_mpwt_input(run_ids=run_patho_dat_ids, input_folder=input_folder, pgdbs_folder_path=pgdbs_folder_path, - patho_hole_filler=patho_hole_filler, dat_extraction=dat_extraction, output_folder=output_folder, - size_reduction=size_reduction, only_dat_creation=None, taxon_file=taxon_file) + patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, + dat_extraction=dat_extraction, output_folder=output_folder, size_reduction=size_reduction, + only_dat_creation=None, taxon_file=taxon_file) logger.info('~~~~~~~~~~Creation of input data from Genbank/GFF/PF~~~~~~~~~~') mpwt_pool.map(pwt_input_files, multiprocess_inputs) @@ -149,8 +154,9 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None dat_run_ids = create_only_dat_lisp(pgdbs_folder_path, tmp_folder) multiprocess_inputs = create_mpwt_input(run_ids=dat_run_ids, input_folder=tmp_folder, pgdbs_folder_path=pgdbs_folder_path, - patho_hole_filler=patho_hole_filler, dat_extraction=dat_extraction, output_folder=output_folder, - size_reduction=size_reduction, only_dat_creation=only_dat_creation, taxon_file=taxon_file) + patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, + dat_extraction=dat_extraction, output_folder=output_folder, size_reduction=size_reduction, + only_dat_creation=only_dat_creation, taxon_file=taxon_file) # Add species that have data in PGDB but are not present in output folder. # Or if ignore_error has been used, select only PathoLogic build that have succeed + species in input with PGDB and not in output. if input_folder: @@ -163,8 +169,9 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None for run_dat_id in run_dat_ids: create_dat_creation_script(run_dat_id, input_folder + "/" + run_dat_id + "/" + "dat_creation.lisp") multiprocess_dat_inputs = create_mpwt_input(run_ids=run_dat_ids, input_folder=input_folder, pgdbs_folder_path=pgdbs_folder_path, - patho_hole_filler=patho_hole_filler, dat_extraction=dat_extraction, output_folder=output_folder, - size_reduction=size_reduction, only_dat_creation=None, taxon_file=taxon_file) + patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, + dat_extraction=dat_extraction, output_folder=output_folder, size_reduction=size_reduction, + only_dat_creation=None, taxon_file=taxon_file) multiprocess_inputs.extend(multiprocess_dat_inputs) # Create BioPAX/attributes-values dat files. @@ -208,7 +215,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None mpwt_pool.join() # Turn on loading of pubmed entries. - if turn_off_citations: + if patho_citations: utils.pubmed_citations(activate_citations=True) end_time = time.time() diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py index 2ad479f..ccca8e5 100755 --- a/mpwt/pathologic_input.py +++ b/mpwt/pathologic_input.py @@ -426,8 +426,10 @@ def pwt_input_files(multiprocess_input): def create_mpwt_input(run_ids, input_folder, pgdbs_folder_path, - patho_hole_filler=None, dat_extraction=None, output_folder=None, - size_reduction=None, only_dat_creation=None, taxon_file=None): + patho_hole_filler=None, patho_operon_predictor=None, + dat_extraction=None, output_folder=None, + size_reduction=None, only_dat_creation=None, + taxon_file=None): """ Create input list for all multiprocess function, containing one lsit for each input subfolder. All arguments are also stored. 
@@ -456,6 +458,7 @@ def create_mpwt_input(run_ids, input_folder, pgdbs_folder_path, multiprocess_input['pgdb_folders'] = pgdb_id_folders multiprocess_input['species_input_folder_path'] = input_folder_path multiprocess_input['patho_hole_filler'] = patho_hole_filler + multiprocess_input['patho_operon_predictor'] = patho_operon_predictor multiprocess_input['dat_extraction'] = dat_extraction multiprocess_input['output_folder'] = output_folder multiprocess_input['size_reduction'] = size_reduction diff --git a/mpwt/pwt_wrapper.py b/mpwt/pwt_wrapper.py index 4485df5..89ee818 100755 --- a/mpwt/pwt_wrapper.py +++ b/mpwt/pwt_wrapper.py @@ -65,6 +65,7 @@ def run_pwt(multiprocess_input): """ species_input_folder_path = multiprocess_input['species_input_folder_path'] patho_hole_filler = multiprocess_input['patho_hole_filler'] + patho_operon_predictor = multiprocess_input['patho_operon_predictor'] cmd_options = ['-no-web-cel-overview', '-no-cel-overview', '-no-patch-download', '-disable-metadata-saving', '-nologfile'] @@ -73,6 +74,9 @@ def run_pwt(multiprocess_input): if patho_hole_filler: cmd_pwt.append('-hole-filler') + if patho_operon_predictor: + cmd_pwt.append('-operon-predictor') + logger.info(' '.join(cmd_pwt)) error_status = None From fddf6c361c5b468c7b1940a15fba2a1592dc7c08 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 17:53:39 +0100 Subject: [PATCH 04/15] Update Readme with Operon Predictor option (issue #33). Update Readme with no loading of PubMed citations (issue #34). --- README.rst | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 475c0f3..60d5c27 100755 --- a/README.rst +++ b/README.rst @@ -262,7 +262,7 @@ mpwt can be used with the command line: .. code:: sh - mpwt -f path/to/folder/input [-o path/to/folder/output] [--patho] [--hf] [--dat] [--md] [--cpu INT] [-r] [--clean] [--log path/to/folder/log] [--ignore-error] [-v] + mpwt -f path/to/folder/input [-o path/to/folder/output] [--patho] [--hf] [--op] [--nc] [--dat] [--md] [--cpu INT] [-r] [--clean] [--log path/to/folder/log] [--ignore-error] [-v] Optional argument are identified by []. 
@@ -279,6 +279,8 @@ mpwt can be used in a python script with an import: output_folder=folder_output, patho_inference=optional_boolean, patho_hole_filler=optional_boolean, + patho_operon_predictor=optional_boolean, + patho_citations=optional_boolean, dat_creation=optional_boolean, dat_extraction=optional_boolean, size_reduction=optional_boolean, @@ -291,13 +293,17 @@ mpwt can be used in a python script with an import: +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | Command line argument | Python argument | description | +=========================+================================================+=========================================================================+ -| -f | input_folder(string: folder pathname) | input folder as described in Input data | +| -f | input_folder(string: folder pathname) | Input folder as described in Input data | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| -o | output_folder(string: folder pathname) | output folder containing PGDB data or dat files (see --dat arguments) | +| -o | output_folder(string: folder pathname) | Output folder containing PGDB data or dat files (see --dat arguments) | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| --patho | patho_inference(boolean) | launch PathoLogic inference on input folder | +| --patho | patho_inference(boolean) | Launch PathoLogic inference on input folder | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| --hf | patho_hole_filler(boolean) | launch PathoLogic Hole Filler with Blast | +| --hf | patho_hole_filler(boolean) | Launch PathoLogic Hole Filler with Blast | ++-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ +| --op | patho_operon_predictor(boolean) | Launch PathoLogic Operon Predictor | ++-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ +| --nc | patho_citations(boolean) | Launch PathoLogic without loading PubMed citations | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | --dat | dat_creation(boolean) | Create BioPAX/attribute-value dat files | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ @@ -350,13 +356,13 @@ Create PGDBs of studied organisms inside ptools-local: mpwt.multiprocess_pwt(input_folder='path/to/folder/input', patho_inference=True) -Create PGDBs of studied organisms inside ptools-local with the Hole-Filler: +Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations: .. .. code:: sh - mpwt -f path/to/folder/input --patho --hf --log path/to/folder/log + mpwt -f path/to/folder/input --patho --hf --op --nc --log path/to/folder/log .. 
code:: python @@ -364,6 +370,8 @@ Create PGDBs of studied organisms inside ptools-local with the Hole-Filler: mpwt.multiprocess_pwt(input_folder='path/to/folder/input', patho_inference=True, patho_hole_filler=True, + patho_operon_predictor=True, + patho_citations=True, patho_log='path/to/folder/log') Create PGDBs of studied organisms inside ptools-local and create dat files: From 4287fd0c0bd0c0950dfa9b466364201638e07466 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 18:12:05 +0100 Subject: [PATCH 05/15] Fix issue with result files from Hole Filler and Operon Predictor (issue #33). --- mpwt/pwt_wrapper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mpwt/pwt_wrapper.py b/mpwt/pwt_wrapper.py index 89ee818..f9f9467 100755 --- a/mpwt/pwt_wrapper.py +++ b/mpwt/pwt_wrapper.py @@ -194,7 +194,10 @@ def run_move_pgdb(move_data): for pgdb_file in os.listdir(pgdb_tmp_folder_path): pgdb_file_pathname = pgdb_tmp_folder_path + '/' + pgdb_file if '.dat' not in pgdb_file: - os.remove(pgdb_file_pathname) + if os.path.isfile(pgdb_file): + os.remove(pgdb_file_pathname) + elif os.path.isdir(pgdb_file): + shutil.rmtree(pgdb_file_pathname) shutil.make_archive(output_folder + '/' + pgdb_folder_dbname, 'zip', pgdb_tmp_folder_path) shutil.rmtree(pgdb_folder_path) else: @@ -202,4 +205,7 @@ def run_move_pgdb(move_data): if dat_extraction: for pgdb_file in os.listdir(output_species): if '.dat' not in pgdb_file: - os.remove(output_species+'/'+pgdb_file) + if os.path.isfile(output_species+'/'+pgdb_file): + os.remove(output_species+'/'+pgdb_file) + elif os.path.isdir(output_species+'/'+pgdb_file): + shutil.rmtree(output_species+'/'+pgdb_file) From a7c944e2aa73d50d244de23c6ce20156f8057772 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Wed, 20 Nov 2019 15:04:33 +0100 Subject: [PATCH 06/15] Update function comments with new arguments. 
--- mpwt/mpwt_workflow.py | 12 +++++++----- mpwt/pathologic_input.py | 1 + mpwt/utils.py | 19 +++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index d5a6996..0192073 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -34,11 +34,13 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None Args: input_folder (str): pathname to input folder output_folder (str): pathname to output folder - patho_inference (bool): pathologic boolean (True/False) - patho_hole_filler (bool): pathologic hole filler boolean (True/False) - dat_creation (bool): BioPAX/attributes-values files creation boolean (True/False) - dat_extraction (bool): BioPAX/attributes-values files extraction boolean (True/False) - size_reduction (bool): Delete ptools-local data at the end boolean (True/False) + patho_inference (bool): PathoLogic inference (True/False) + patho_hole_filler (bool): PathoLogic hole filler (True/False) + patho_operon_predictor (bool): PathoLogic operon predictor (True/False) + patho_citations (bool): turning off loading of PubMed citations (True/False) + dat_creation (bool): BioPAX/attributes-values files creation (True/False) + dat_extraction (bool): BioPAX/attributes-values files extraction (True/False) + size_reduction (bool): delete ptools-local data at the end (True/False) number_cpu (int): number of CPU used (default=1) patho_log (str): pathname to mpwt log folder verbose (bool): verbose argument diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py index ccca8e5..305a757 100755 --- a/mpwt/pathologic_input.py +++ b/mpwt/pathologic_input.py @@ -439,6 +439,7 @@ def create_mpwt_input(run_ids, input_folder, pgdbs_folder_path, input_folder (str): pathname to input folder pgdbs_folder_path (str): pathname to species PGDB in ptools-local patho_hole_filler (bool): PathoLogic Hole Filler argument + patho_operon_predictor (bool): PathoLogic Operon predictor argument dat_extraction (bool): BioPAX/attribute-values file extraction argument output_folder (str): pathname to output folder size_reduction (bool): ptools-local PGDB deletion after processing argument diff --git a/mpwt/utils.py b/mpwt/utils.py index d42ae70..66522f9 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -168,6 +168,14 @@ def permission_change(folder_pathname): def create_pathologic_file(input_folder, output_folder, number_cpu=None): + """ + Create PathoLogic file from Genbank or GFF files. + + Args: + input_folder (str): pathname to the folder containing Genbanks or GFFs + output_folder (str): pathname to the output folder containing the PathoLogic files + number_cpu (str): number of CPU + """ if number_cpu: number_cpu_to_use = int(number_cpu) else: @@ -199,6 +207,12 @@ def create_pathologic_file(input_folder, output_folder, number_cpu=None): def run_create_pathologic_file(multiprocessing_input_data): + """ + Create PathoLogic files from a Genbank or a GFF file. + + Args: + multiprocess_input (dictionary): contains multiprocess input (input folder, output_path, output folder and input_name) + """ input_path = multiprocessing_input_data['input_path'] output_folder = multiprocessing_input_data['output_folder'] output_path = multiprocessing_input_data['output_path'] @@ -332,6 +346,11 @@ def run_create_pathologic_file(multiprocessing_input_data): def pubmed_citations(activate_citations): + """ + Activate or deactivate loading of PubMed citations. 
+ + TODO: update this function with the argument from the new version of Pathway Tools + """ ptools_init_filepath = find_ptools_path() + '/ptools-init.dat' new_ptools_file = "" with open(ptools_init_filepath, 'r') as ptools_init_file: From 24140ee7d5f4d5dc0343b20e89df93f4c2ef12df Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Fri, 22 Nov 2019 16:42:48 +0100 Subject: [PATCH 07/15] Fix issue with topf and PF as input (issue #30). Add test for topf (issue #30). --- mpwt/__init__.py | 2 +- mpwt/__main__.py | 2 +- mpwt/utils.py | 77 +++++++++++++------ .../fatty_acid_beta_oxydation_I_gff.gff | 32 ++++---- ...pwt_test.py => test_mpwt_pathway_tools.py} | 5 +- 5 files changed, 76 insertions(+), 42 deletions(-) rename test/{mpwt_test.py => test_mpwt_pathway_tools.py} (93%) diff --git a/mpwt/__init__.py b/mpwt/__init__.py index 08e2354..85d1f35 100755 --- a/mpwt/__init__.py +++ b/mpwt/__init__.py @@ -1,3 +1,3 @@ from mpwt.pwt_wrapper import run_pwt, run_pwt_dat from mpwt.mpwt_workflow import multiprocess_pwt -from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs +from mpwt.utils import cleaning, cleaning_input, create_pathologic_file, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs diff --git a/mpwt/__main__.py b/mpwt/__main__.py index 6ef12aa..bcd3728 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -34,7 +34,7 @@ --ignore-error Ignore errors (PathoLogic and dat creation) and continue for successful builds. --taxon-file For the use of the taxon_id.tsv file to find the taxon ID. -v Verbose. - topf Will convert Genbank file into PathoLogic Format file. + topf Will convert Genbank and/or GFF files into PathoLogic Format file. """ diff --git a/mpwt/utils.py b/mpwt/utils.py index 66522f9..7e4b682 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -185,20 +185,41 @@ def create_pathologic_file(input_folder, output_folder, number_cpu=None): mpwt_pool = Pool(processes=number_cpu_to_use) - for input_name in os.listdir(input_folder): + input_names = os.listdir(input_folder) + + if 'taxon_id.tsv' in input_names: + taxon_ids = {} + input_names.remove('taxon_id.tsv') + with open(input_folder + '/taxon_id.tsv') as taxon_file: + for row in csv.reader(taxon_file, delimiter='\t'): + taxon_ids[row[0]] = row[1] + else: + taxon_ids = None + + for input_name in input_names: input_path_gbk = input_folder + '/' + input_name + '/' + input_name + '.gbk' input_path_gff = input_folder + '/' + input_name + '/' + input_name + '.gff' if os.path.exists(input_path_gbk): input_path = input_path_gbk elif os.path.exists(input_path_gff): input_path = input_path_gff + elif all([True for species_file in os.listdir(input_folder + '/' + input_name + '/') if '.pf' in species_file or '.fasta' in species_file]): + input_path = input_folder + '/' + input_name + '/' else: sys.exit('No .gff or .gbk file in ' + input_folder + '/' + input_name) + output_path = output_folder + '/' + input_name - if not os.path.exists(output_path): - os.makedirs(output_path) - multiprocessing_input_data.append({'input_path': input_path, 'output_path': output_path, - 'output_folder': output_folder, 'input_name': input_name}) + + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + multiprocessing_dict = {'input_path': input_path, 'output_path': output_path, + 'output_folder': output_folder, 'input_name': input_name} + if taxon_ids: + if input_name in taxon_ids: + multiprocessing_dict['taxon_id'] = taxon_ids[input_name] + + 
multiprocessing_input_data.append(multiprocessing_dict) mpwt_pool.map(run_create_pathologic_file, multiprocessing_input_data) @@ -206,6 +227,18 @@ def create_pathologic_file(input_folder, output_folder, number_cpu=None): mpwt_pool.join() +def write_taxon_id_file(input_name, taxon_id, output_folder): + if not os.path.exists(output_folder + '/taxon_id.tsv'): + with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: + taxon_writer = csv.writer(taxon_id_file, delimiter='\t') + taxon_writer.writerow(['species', 'taxon_id']) + taxon_writer.writerow([input_name, taxon_id]) + else: + with open(output_folder + '/taxon_id.tsv', 'a') as taxon_id_file: + taxon_writer = csv.writer(taxon_id_file, delimiter='\t') + taxon_writer.writerow([input_name, taxon_id]) + + def run_create_pathologic_file(multiprocessing_input_data): """ Create PathoLogic files from a Genbank or a GFF file. @@ -220,6 +253,10 @@ def run_create_pathologic_file(multiprocessing_input_data): taxon_id = None # Add taxon ID in taxon_id.tsv if available. if input_path.endswith('.gbk'): + + if not os.path.exists(output_path): + os.makedirs(output_path) + with open(input_path, "r") as gbk: first_seq_record = next(SeqIO.parse(gbk, "genbank")) src_features = [feature for feature in first_seq_record.features if feature.type == "source"] @@ -232,15 +269,7 @@ def run_create_pathologic_file(multiprocessing_input_data): except KeyError: logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(input_path)) if taxon_id: - if not os.path.exists(output_folder + '/taxon_id.tsv'): - with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow(['species', 'taxon_id']) - taxon_writer.writerow([input_name, taxon_id]) - else: - with open(output_folder + '/taxon_id.tsv', 'a') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow([input_name, taxon_id]) + write_taxon_id_file(input_name, taxon_id, output_folder) for record in SeqIO.parse(input_path, 'genbank'): element_id = record.id @@ -297,6 +326,10 @@ def run_create_pathologic_file(multiprocessing_input_data): element_file.write('//\n\n') elif input_path.endswith('.gff'): + + if not os.path.exists(output_path): + os.makedirs(output_path) + gff_database = gffutils.create_db(input_path, ':memory:', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True) regions = list(set([region.chrom for region in gff_database.features_of_type('region')])) try: @@ -308,15 +341,8 @@ def run_create_pathologic_file(multiprocessing_input_data): if 'taxon' in dbxref: taxon_id = dbxref.replace('taxon:', '') if taxon_id: - if not os.path.exists(output_folder + '/taxon_id.tsv'): - with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow(['species', 'taxon_id']) - taxon_writer.writerow([input_name, taxon_id]) - else: - with open(output_folder + '/taxon_id.tsv', 'a') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow([input_name, taxon_id]) + write_taxon_id_file(input_name, taxon_id, output_folder) + for record in SeqIO.parse(input_path.replace('.gff', '.fasta'), 'fasta'): output_fasta = output_path + '/' + record.id + '.fasta' SeqIO.write(record, output_fasta, 'fasta') @@ -344,6 +370,11 @@ def 
run_create_pathologic_file(multiprocessing_input_data): element_file.write('//\n\n') + elif all([True for species_file in os.listdir(input_path) if '.pf' in species_file or '.fasta' in species_file]): + taxon_id = multiprocessing_input_data['taxon_id'] + write_taxon_id_file(input_name, taxon_id, output_folder) + shutil.copytree(input_path, output_path) + def pubmed_citations(activate_citations): """ diff --git a/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff b/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff index 03de3a4..6c6bc6a 100755 --- a/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff +++ b/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff @@ -3,20 +3,20 @@ #!processor NCBI annotwriter #!genome-build ASM584v2 #!genome-build-accession NCBI_Assembly:GCF_000005845.2 -##sequence-region NC_000913.3 1 12642 +##sequence-region NC_000913_3 1 12642 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=511145 -NC_000913.3 RefSeq region 1 12642 . + . ID=id0;Dbxref=taxon:511145;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=K-12;substrain=MG1655 -NC_000913.3 RefSeq gene 1 2445 . - . ID=gene226;Dbxref=ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=fadE;gbkey=Gene;gene=fadE;gene_biotype=protein_coding;gene_synonym=ECK0222,yafH;locus_tag=b0221 -NC_000913.3 RefSeq CDS 1 2445 . - 0 ID=cds216;Parent=gene226;Dbxref=UniProtKB/Swiss-Prot:Q47146,Genbank:NP_414756.2,ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=NP_414756.2;gbkey=CDS;gene=fadE;orig_transcript_id=gnl|b0221|mrna.b0221;product=acyl-CoA dehydrogenase;protein_id=NP_414756.2;transl_table=11 -NC_000913.3 RefSeq gene 2446 4146 . + . ID=gene1781;Dbxref=ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=fadK;gbkey=Gene;gene=fadK;gene_biotype=protein_coding;gene_synonym=ECK1699,ydiD;locus_tag=b1701 -NC_000913.3 RefSeq CDS 2446 4146 . + 0 ID=cds1721;Parent=gene1781;Dbxref=UniProtKB/Swiss-Prot:P38135,Genbank:NP_416216.5,ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=NP_416216.5;gbkey=CDS;gene=fadK;orig_transcript_id=gnl|b1701|mrna.b1701;product=short chain acyl-CoA synthetase;protein_id=NP_416216.5;transl_table=11 -NC_000913.3 RefSeq gene 4147 5832 . - . ID=gene1887;Dbxref=ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=fadD;gbkey=Gene;gene=fadD;gene_biotype=protein_coding;gene_synonym=ECK1803,oldD;locus_tag=b1805 -NC_000913.3 RefSeq CDS 4147 5832 . - 0 ID=cds1827;Parent=gene1887;Dbxref=UniProtKB/Swiss-Prot:P69451,Genbank:NP_416319.1,ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=NP_416319.1;gbkey=CDS;gene=fadD;orig_transcript_id=gnl|b1805|mrna.b1805;product=fatty acyl-CoA synthetase;protein_id=NP_416319.1;transl_table=11 -NC_000913.3 RefSeq gene 5833 7977 . - . ID=gene2441;Dbxref=ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=fadJ;gbkey=Gene;gene=fadJ;gene_biotype=protein_coding;gene_synonym=ECK2335,yfcX;locus_tag=b2341 -NC_000913.3 RefSeq CDS 5833 7977 . - 0 ID=cds2356;Parent=gene2441;Dbxref=UniProtKB/Swiss-Prot:P77399,Genbank:NP_416843.1,ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=NP_416843.1;gbkey=CDS;gene=fadJ;orig_transcript_id=gnl|b2341|mrna.b2341;product=3-hydroxyacyl-CoA dehydrogenase FadJ;protein_id=NP_416843.1;transl_table=11 -NC_000913.3 RefSeq gene 7978 9288 . - . 
ID=gene2442;Dbxref=ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=fadI;gbkey=Gene;gene=fadI;gene_biotype=protein_coding;gene_synonym=ECK2336,yfcY;locus_tag=b2342 -NC_000913.3 RefSeq CDS 7978 9288 . - 0 ID=cds2357;Parent=gene2442;Dbxref=UniProtKB/Swiss-Prot:P76503,Genbank:NP_416844.1,ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=NP_416844.1;gbkey=CDS;gene=fadI;orig_transcript_id=gnl|b2342|mrna.b2342;product=3-ketoacyl-CoA thiolase FadI;protein_id=NP_416844.1;transl_table=11 -NC_000913.3 RefSeq gene 9289 10452 . - . ID=gene3987;Dbxref=ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=fadA;gbkey=Gene;gene=fadA;gene_biotype=protein_coding;gene_synonym=ECK3837,oldA;locus_tag=b3845 -NC_000913.3 RefSeq CDS 9289 10452 . - 0 ID=cds3816;Parent=gene3987;Dbxref=UniProtKB/Swiss-Prot:P21151,Genbank:YP_026272.1,ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=YP_026272.1;gbkey=CDS;gene=fadA;orig_transcript_id=gnl|b3845|mrna.b3845;product=3-ketoacyl-CoA thiolase;protein_id=YP_026272.1;transl_table=11 -NC_000913.3 RefSeq gene 10453 12642 . - . ID=gene3988;Dbxref=ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=fadB;gbkey=Gene;gene=fadB;gene_biotype=protein_coding;gene_synonym=ECK3838,oldB;locus_tag=b3846 -NC_000913.3 RefSeq CDS 10453 12642 . - 0 ID=cds3817;Parent=gene3988;Dbxref=UniProtKB/Swiss-Prot:P21177,Genbank:NP_418288.1,ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=NP_418288.1;gbkey=CDS;gene=fadB;orig_transcript_id=gnl|b3846|mrna.b3846;product=dodecenoyl-CoA delta-isomerase%2C enoyl-CoA hydratase%2C 3-hydroxybutyryl-CoA epimerase%2C 3-hydroxyacyl-CoA dehydrogenase;protein_id=NP_418288.1;transl_table=11 +NC_000913_3 RefSeq region 1 12642 . + . ID=id0;Dbxref=taxon:511145;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=K-12;substrain=MG1655 +NC_000913_3 RefSeq gene 1 2445 . - . ID=gene226;Dbxref=ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=fadE;gbkey=Gene;gene=fadE;gene_biotype=protein_coding;gene_synonym=ECK0222,yafH;locus_tag=b0221 +NC_000913_3 RefSeq CDS 1 2445 . - 0 ID=cds216;Parent=gene226;Dbxref=UniProtKB/Swiss-Prot:Q47146,Genbank:NP_414756.2,ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=NP_414756.2;gbkey=CDS;gene=fadE;orig_transcript_id=gnl|b0221|mrna.b0221;product=acyl-CoA dehydrogenase;protein_id=NP_414756.2;transl_table=11 +NC_000913_3 RefSeq gene 2446 4146 . + . ID=gene1781;Dbxref=ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=fadK;gbkey=Gene;gene=fadK;gene_biotype=protein_coding;gene_synonym=ECK1699,ydiD;locus_tag=b1701 +NC_000913_3 RefSeq CDS 2446 4146 . + 0 ID=cds1721;Parent=gene1781;Dbxref=UniProtKB/Swiss-Prot:P38135,Genbank:NP_416216.5,ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=NP_416216.5;gbkey=CDS;gene=fadK;orig_transcript_id=gnl|b1701|mrna.b1701;product=short chain acyl-CoA synthetase;protein_id=NP_416216.5;transl_table=11 +NC_000913_3 RefSeq gene 4147 5832 . - . ID=gene1887;Dbxref=ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=fadD;gbkey=Gene;gene=fadD;gene_biotype=protein_coding;gene_synonym=ECK1803,oldD;locus_tag=b1805 +NC_000913_3 RefSeq CDS 4147 5832 . 
- 0 ID=cds1827;Parent=gene1887;Dbxref=UniProtKB/Swiss-Prot:P69451,Genbank:NP_416319.1,ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=NP_416319.1;gbkey=CDS;gene=fadD;orig_transcript_id=gnl|b1805|mrna.b1805;product=fatty acyl-CoA synthetase;protein_id=NP_416319.1;transl_table=11 +NC_000913_3 RefSeq gene 5833 7977 . - . ID=gene2441;Dbxref=ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=fadJ;gbkey=Gene;gene=fadJ;gene_biotype=protein_coding;gene_synonym=ECK2335,yfcX;locus_tag=b2341 +NC_000913_3 RefSeq CDS 5833 7977 . - 0 ID=cds2356;Parent=gene2441;Dbxref=UniProtKB/Swiss-Prot:P77399,Genbank:NP_416843.1,ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=NP_416843.1;gbkey=CDS;gene=fadJ;orig_transcript_id=gnl|b2341|mrna.b2341;product=3-hydroxyacyl-CoA dehydrogenase FadJ;protein_id=NP_416843.1;transl_table=11 +NC_000913_3 RefSeq gene 7978 9288 . - . ID=gene2442;Dbxref=ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=fadI;gbkey=Gene;gene=fadI;gene_biotype=protein_coding;gene_synonym=ECK2336,yfcY;locus_tag=b2342 +NC_000913_3 RefSeq CDS 7978 9288 . - 0 ID=cds2357;Parent=gene2442;Dbxref=UniProtKB/Swiss-Prot:P76503,Genbank:NP_416844.1,ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=NP_416844.1;gbkey=CDS;gene=fadI;orig_transcript_id=gnl|b2342|mrna.b2342;product=3-ketoacyl-CoA thiolase FadI;protein_id=NP_416844.1;transl_table=11 +NC_000913_3 RefSeq gene 9289 10452 . - . ID=gene3987;Dbxref=ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=fadA;gbkey=Gene;gene=fadA;gene_biotype=protein_coding;gene_synonym=ECK3837,oldA;locus_tag=b3845 +NC_000913_3 RefSeq CDS 9289 10452 . - 0 ID=cds3816;Parent=gene3987;Dbxref=UniProtKB/Swiss-Prot:P21151,Genbank:YP_026272.1,ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=YP_026272.1;gbkey=CDS;gene=fadA;orig_transcript_id=gnl|b3845|mrna.b3845;product=3-ketoacyl-CoA thiolase;protein_id=YP_026272.1;transl_table=11 +NC_000913_3 RefSeq gene 10453 12642 . - . ID=gene3988;Dbxref=ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=fadB;gbkey=Gene;gene=fadB;gene_biotype=protein_coding;gene_synonym=ECK3838,oldB;locus_tag=b3846 +NC_000913_3 RefSeq CDS 10453 12642 . 
- 0 ID=cds3817;Parent=gene3988;Dbxref=UniProtKB/Swiss-Prot:P21177,Genbank:NP_418288.1,ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=NP_418288.1;gbkey=CDS;gene=fadB;orig_transcript_id=gnl|b3846|mrna.b3846;product=dodecenoyl-CoA delta-isomerase%2C enoyl-CoA hydratase%2C 3-hydroxybutyryl-CoA epimerase%2C 3-hydroxyacyl-CoA dehydrogenase;protein_id=NP_418288.1;transl_table=11 diff --git a/test/mpwt_test.py b/test/test_mpwt_pathway_tools.py similarity index 93% rename from test/mpwt_test.py rename to test/test_mpwt_pathway_tools.py index 845a5cf..234d2de 100755 --- a/test/mpwt_test.py +++ b/test/test_mpwt_pathway_tools.py @@ -36,7 +36,9 @@ def test_multiprocess_pwt_import(): """ mpwt.remove_pgdbs('fatty_acid_beta_oxydation_icyc,fatty_acid_beta_oxydation_i_gffcyc,fatty_acid_beta_oxydation_i_pfcyc') mpwt.cleaning_input('test') - mpwt.multiprocess_pwt('test', 'test_output', patho_inference=True, dat_creation=True, dat_extraction=True, size_reduction=False, verbose=True) + + mpwt.create_pathologic_file('test', 'test_pf') + mpwt.multiprocess_pwt('test_pf', 'test_output', patho_inference=True, dat_creation=True, dat_extraction=True, size_reduction=False, verbose=True) pathway_fabo_pathname = "test_output/fatty_acid_beta_oxydation_I_gff/pathways.dat" expected_tca_reactions = reaction_extraction(pathway_fabo_pathname) @@ -51,6 +53,7 @@ def test_multiprocess_pwt_import(): assert set(fabo_reactions()).issubset(set(expected_pf_fabo_reactions)) mpwt.cleaning_input('test') + shutil.rmtree('test_pf') shutil.rmtree('test_output') shutil.rmtree('__pycache__') From 1f0bbd46bb86c5f56228ab87a1f426b5fa431307 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Fri, 22 Nov 2019 16:44:01 +0100 Subject: [PATCH 08/15] Add topf in Readme (issue #30). --- README.rst | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.rst b/README.rst index 60d5c27..f9c2b53 100755 --- a/README.rst +++ b/README.rst @@ -326,6 +326,21 @@ mpwt can be used in a python script with an import: | -v | verbose(boolean) | Print some information about the processing of mpwt | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ +There is also another argument: + +.. code:: sh + + mpwt topf -f input_folder -o output_folder -c cpu_number + +.. code:: python + + import mpwt + mpwt.create_pathologic_file(input_folder, output_folder, cpu_number) + +This argument reads the input data inside the input folder. Then it converts Genbank and GFF files into PathoLogic Format files. And if there is already PathoLogic files it copies them. + +It can be used to avoid issues with parsing Genbank and GFF files. But it is an early Work in Progress. + Examples ~~~~~~~~ @@ -356,6 +371,21 @@ Create PGDBs of studied organisms inside ptools-local: mpwt.multiprocess_pwt(input_folder='path/to/folder/input', patho_inference=True) +Convert Genbank and GFF files into PathoLogic files then create PGDBs of studied organisms inside ptools-local: + +.. + + .. code:: sh + + mpwt topf -f path/to/folder/input -o path/to/folder/pf + mpwt -f path/to/folder/pf --patho + + .. code:: python + + import mpwt + mpwt.create_pathologic_file(input_folder='path/to/folder/input', output_folder='path/to/folder/pf') + mpwt.multiprocess_pwt(input_folder='path/to/folder/pf', patho_inference=True) + Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations: .. 
From 12c94ce988352a61e4c066582922a3c1cb2efe8f Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 26 Nov 2019 13:15:12 +0100
Subject: [PATCH 09/15] Fix issue with PathoLogic file creation when using GFF (issue #30). Fix link in Readme.

---
 README.rst    | 2 +-
 mpwt/utils.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index f9c2b53..8c8e8f8 100755
--- a/README.rst
+++ b/README.rst
@@ -151,7 +151,7 @@ PF file example:
     INTRON START1-STOP1
 //
 
-Look at the `Pathologic format `__ for more informations.
+Look at the `Pathologic format `__ for more information.
 
 You have to provide one nucleotide sequence for each pathologic containing one scaffold/contig.
 
diff --git a/mpwt/utils.py b/mpwt/utils.py
index 7e4b682..864cec2 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -367,8 +367,7 @@ def run_create_pathologic_file(multiprocessing_input_data):
                     if 'ec_number' in child.attributes:
                         for ec in child.attributes['ec_number']:
                             element_file.write('EC\t' + ec + '\n')
-
-                element_file.write('//\n\n')
+                    element_file.write('//\n\n')
 
     elif all([True for species_file in os.listdir(input_path) if '.pf' in species_file or '.fasta' in species_file]):
         taxon_id = multiprocessing_input_data['taxon_id']
         write_taxon_id_file(input_name, taxon_id, output_folder)
         shutil.copytree(input_path, output_path)

From d7154f2c08f48f788fd8dd2567740f4d743b66b6 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 26 Nov 2019 14:57:00 +0100
Subject: [PATCH 10/15] Fix issue with GO terms in topf (issue #30).

---
 mpwt/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mpwt/utils.py b/mpwt/utils.py
index 864cec2..3c8b5c1 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -312,10 +312,10 @@ def run_create_pathologic_file(multiprocessing_input_data):
                         for go in feature.qualifiers['go_component']:
                             element_file.write('GO\t' + go + '\n')
                     if 'go_function' in feature.qualifiers:
-                        for go in feature.qualifiers['go_component']:
+                        for go in feature.qualifiers['go_function']:
                             element_file.write('GO\t' + go + '\n')
                     if 'go_process' in feature.qualifiers:
-                        for go in feature.qualifiers['go_component']:
+                        for go in feature.qualifiers['go_process']:
                             element_file.write('GO\t' + go + '\n')
                     element_file.write('PRODUCT-TYPE\tP' + '\n')
                     if gene_id:
                         element_file.write('PRODUCT-ID\tprot ' + gene_id + '\n')

From 2d9326746a4f75424a990422b8050ec9c98098f4 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Mon, 6 Jan 2020 16:48:23 +0100
Subject: [PATCH 11/15] Add option to create taxon_file.tsv (issue #35). Update function modifying ptools-init.dat (issue #34).
--- README.rst | 8 +++---- mpwt/__main__.py | 4 ++-- mpwt/mpwt_workflow.py | 36 ++++++++++++++++++++--------- mpwt/pathologic_input.py | 49 ++++++++++++++++++++++++++++++++++++++++ mpwt/results_check.py | 2 +- mpwt/utils.py | 9 ++++---- 6 files changed, 86 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index 8c8e8f8..f4a5f2b 100755 --- a/README.rst +++ b/README.rst @@ -280,7 +280,7 @@ mpwt can be used in a python script with an import: patho_inference=optional_boolean, patho_hole_filler=optional_boolean, patho_operon_predictor=optional_boolean, - patho_citations=optional_boolean, + no_download_articles=optional_boolean, dat_creation=optional_boolean, dat_extraction=optional_boolean, size_reduction=optional_boolean, @@ -303,7 +303,7 @@ mpwt can be used in a python script with an import: +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | --op | patho_operon_predictor(boolean) | Launch PathoLogic Operon Predictor | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| --nc | patho_citations(boolean) | Launch PathoLogic without loading PubMed citations | +| --nc | no_download_articles(boolean) | Launch PathoLogic without loading PubMed citations | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | --dat | dat_creation(boolean) | Create BioPAX/attribute-value dat files | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ @@ -386,7 +386,7 @@ Convert Genbank and GFF files into PathoLogic files then create PGDBs of studied mpwt.create_pathologic_file(input_folder='path/to/folder/input', output_folder='path/to/folder/pf') mpwt.multiprocess_pwt(input_folder='path/to/folder/pf', patho_inference=True) -Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations: +Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations (need Pathway Tools 23.5 or higher): .. 
@@ -401,7 +401,7 @@ Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon P patho_inference=True, patho_hole_filler=True, patho_operon_predictor=True, - patho_citations=True, + no_download_articles=True, patho_log='path/to/folder/log') Create PGDBs of studied organisms inside ptools-local and create dat files: diff --git a/mpwt/__main__.py b/mpwt/__main__.py index bcd3728..44f4187 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -63,7 +63,7 @@ def run_mpwt(): patho_inference = args['--patho'] patho_hole_filler = args['--hf'] patho_operon_predictor = args['--op'] - patho_citations = args['--nc'] + no_download_articles = args['--nc'] dat_creation = args['--dat'] move_dat = args['--md'] size_reduction = args['-r'] @@ -116,7 +116,7 @@ def run_mpwt(): patho_inference=patho_inference, patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, - patho_citations=patho_citations, + no_download_articles=no_download_articles, dat_creation=dat_creation, dat_extraction=move_dat, size_reduction=size_reduction, diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index 0192073..d25e886 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -5,6 +5,7 @@ -check the results (results_check) """ +import csv import logging import os import shutil @@ -14,7 +15,7 @@ from mpwt import utils from mpwt.pwt_wrapper import run_pwt, run_pwt_dat, run_move_pgdb from mpwt.results_check import check_dat, check_pwt, permission_change -from mpwt.pathologic_input import check_input_and_existing_pgdb, create_mpwt_input, pwt_input_files, create_only_dat_lisp, create_dat_creation_script +from mpwt.pathologic_input import check_input_and_existing_pgdb, create_mpwt_input, pwt_input_files, create_only_dat_lisp, create_dat_creation_script, read_taxon_id from multiprocessing import Pool logging.basicConfig(format='%(message)s', level=logging.CRITICAL) @@ -23,10 +24,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None, - patho_hole_filler=None, patho_operon_predictor=None, patho_citations=None, + patho_hole_filler=None, patho_operon_predictor=None, no_download_articles=None, dat_creation=None, dat_extraction=None, size_reduction=None, number_cpu=None, patho_log=None, ignore_error=None, - taxon_file=None, turn_off_citations=None, verbose=None): + taxon_file=None, verbose=None): """ Function managing all the workflow (from the creatin of the input files to the results). Use it when you import mpwt in a script. @@ -37,7 +38,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None patho_inference (bool): PathoLogic inference (True/False) patho_hole_filler (bool): PathoLogic hole filler (True/False) patho_operon_predictor (bool): PathoLogic operon predictor (True/False) - patho_citations (bool): turning off loading of PubMed citations (True/False) + no_download_articles (bool): turning off loading of PubMed citations (True/False) dat_creation (bool): BioPAX/attributes-values files creation (True/False) dat_extraction (bool): BioPAX/attributes-values files extraction (True/False) size_reduction (bool): delete ptools-local data at the end (True/False) @@ -72,16 +73,16 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None sys.exit('To use --ignore-error/ignore_error, you need to use the --patho/patho_inference argument.') # Check if taxon_file is used with patho_inference. 
-    if taxon_file and not patho_inference:
-        sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.')
+    if (taxon_file and not patho_inference) and (taxon_file and not input_folder):
+        sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument. Or you can use it with the -f argument to create the taxon file from data.')

     #Check if patho_operon_predictor is used with patho_inference.
     if patho_operon_predictor and not patho_inference:
         sys.exit('To use --op/patho_operon_predictor, you need to use the --patho/patho_inference argument.')

-    #Check if patho_citations is used with patho_inference.
-    if patho_citations and not patho_inference:
-        sys.exit('To use --nc/patho_citations, you need to use the --patho/patho_inference argument.')
+    #Check if no_download_articles is used with patho_inference.
+    if no_download_articles and not patho_inference:
+        sys.exit('To use --nc/no_download_articles, you need to use the --patho/patho_inference argument.')

     # Use the number of cpu given by the user or 1 CPU.
     if number_cpu:
@@ -93,8 +94,21 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
         number_cpu_to_use = 1
     mpwt_pool = Pool(processes=number_cpu_to_use)

+    # Create taxon file in the input folder.
+    if taxon_file and input_folder and not patho_inference:
+        taxon_file_pathname = input_folder + '/taxon_id.tsv'
+        if os.path.exists(taxon_file_pathname):
+            sys.exit('taxon ID file (' + taxon_file_pathname + ') already exists.')
+        else:
+            taxon_ids = read_taxon_id(input_folder)
+            with open(taxon_file_pathname, 'w') as taxon_id_file:
+                taxon_id_writer = csv.writer(taxon_id_file, delimiter='\t')
+                taxon_id_writer.writerow(['species', 'taxon_id'])
+                for species, taxon_id in taxon_ids.items():
+                    taxon_id_writer.writerow([species, taxon_id])
+
     # Turn off loading of pubmed entries.
-    if patho_citations:
+    if no_download_articles:
         utils.pubmed_citations(activate_citations=False)

     # Check input folder and create input files for PathoLogic.
@@ -217,7 +231,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
     mpwt_pool.join()

     # Turn on loading of pubmed entries.
-    if patho_citations:
+    if no_download_articles:
         utils.pubmed_citations(activate_citations=True)

     end_time = time.time()
diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py
index 305a757..43c7493 100755
--- a/mpwt/pathologic_input.py
+++ b/mpwt/pathologic_input.py
@@ -394,6 +394,55 @@ def create_dats_and_lisp(run_folder, taxon_file):

     return all([os.path.isfile(organism_dat), os.path.isfile(genetic_dat), check_lisp_file])

+
+def read_taxon_id(run_folder):
+    taxon_ids = {}
+
+    for input_folder in os.listdir(run_folder):
+        for input_file in os.listdir(run_folder + '/' + input_folder):
+            if '.gbk' in input_file:
+                gbk_pathname = run_folder + '/' + input_folder + '/' + input_file
+                # Take the species name and the taxon id from the genbank file.
+                with open(gbk_pathname, "r") as gbk:
+                    # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
+                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))
+                    # Take the source feature of the first record.
+                    # This feature contains the taxon ID in the db_xref qualifier.
+                    src_features = [feature for feature in first_seq_record.features if feature.type == "source"]
+                    for src_feature in src_features:
+                        try:
+                            src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
+                            for src_dbxref_qualifier in src_dbxref_qualifiers:
+                                if 'taxon:' in src_dbxref_qualifier:
+                                    taxon_id = src_dbxref_qualifier.replace('taxon:', '')
+                        except KeyError:
+                            logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(gbk_pathname))
+
+            elif '.gff' in input_file:
+                gff_pathname = run_folder + '/' + input_folder + '/' + input_file
+
+                # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
+                try:
+                    region_feature = [feature for feature in DataIterator(gff_pathname) if feature.featuretype == 'region'][0]
+                except IndexError:
+                    raise IndexError('No region feature in the GFF file of {0}, GFF file must have region features.'.format(input_folder))
+
+                try:
+                    region_feature.attributes['Dbxref']
+                except KeyError:
+                    raise KeyError('No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'.format(input_folder))
+
+                for dbxref in region_feature.attributes['Dbxref']:
+                    if 'taxon' in dbxref:
+                        taxon_id = dbxref.split('taxon:')[1]
+
+            elif '.pf' in input_file:
+                logger.info('No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'.format(input_folder))
+                taxon_id = "missing"
+            taxon_ids[input_folder] = taxon_id
+
+    return taxon_ids
+
+
 def pwt_input_files(multiprocess_input):
     """
     Check if files needed by Pathway Tools are available, if not create them.
diff --git a/mpwt/results_check.py b/mpwt/results_check.py
index ad1b5f7..a2d3f1b 100755
--- a/mpwt/results_check.py
+++ b/mpwt/results_check.py
@@ -70,7 +70,7 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
                         if patho_log_folder:
                             patho_error_file.write(line)

-                    if 'Build done.' in line:
+                    if 'Build done.' in line or 'PGDB build done.' in line:
                         if patho_log_folder:
                             patho_error_file.write(line)
                         resume_inference_line = next(input_file)
diff --git a/mpwt/utils.py b/mpwt/utils.py
index 3c8b5c1..e89d61a 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -379,17 +379,18 @@ def pubmed_citations(activate_citations):
     """
     Activate or deactivate loading of PubMed citations.

-    TODO: update this function with the argument from the new version of Pathway Tools
+    Args:
+        activate_citations (bool): boolean to indicate if you want to activate or not the downlaod of Pubmed entries.
     """
     ptools_init_filepath = find_ptools_path() + '/ptools-init.dat'
     new_ptools_file = ""
     with open(ptools_init_filepath, 'r') as ptools_init_file:
         for line in ptools_init_file.read().split('\n'):
-            if '##download-pubmed-citations' in line:
+            if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
                 if activate_citations:
-                    line = line.replace('N', 'Y')
+                    line = line.replace('F', 'T')
                 else:
-                    line = line.replace('Y', 'N')
+                    line = line.replace('T', 'F')
             if line != '':
                 new_ptools_file = new_ptools_file + line + '\n'
             else:

From e0d450ba5c6ce42676893ea411b49d0e5bbe3185 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 7 Jan 2020 13:56:17 +0100
Subject: [PATCH 12/15] Add errors and warnings counts in log.
---
 mpwt/results_check.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/mpwt/results_check.py b/mpwt/results_check.py
index a2d3f1b..a7e5c66 100755
--- a/mpwt/results_check.py
+++ b/mpwt/results_check.py
@@ -37,7 +37,7 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
         patho_error_file = open(patho_error_pathname, 'w')
         patho_resume_file = open(patho_resume_pathname, 'w')
         patho_resume_writer = csv.writer(patho_resume_file, delimiter='\t', lineterminator='\n')
-        patho_resume_writer.writerow(['species', 'gene_number', 'protein_number', 'pathway_number', 'reaction_number', 'compound_number'])
+        patho_resume_writer.writerow(['species', 'gene_number', 'protein_number', 'pathway_number', 'reaction_number', 'compound_number', 'pwt_non_fatal_error', 'pwt_warning'])

     failed_inferences = []
     passed_inferences = []
@@ -53,11 +53,17 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
             patho_error_file.write('\n')

        fatal_error_index = None
+        non_fatal_error_count = 0
+        warning_count = 0

        if os.path.exists(patho_log):
            with open(patho_log, 'r') as input_file:
                for index, line in enumerate(input_file):
-                    if 'fatal error' in line or 'Error' in line:
+                    if ';;; Error:' in line:
+                        non_fatal_error_count += 1
+                    if 'Warning:' in line:
+                        warning_count += 1
+                    if 'fatal error' in line:
                        fatal_error_index = index
                        if species not in failed_inferences:
                            failed_inferences.append(species)
@@ -75,12 +81,18 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
                            patho_error_file.write(line)
                        resume_inference_line = next(input_file)
                        patho_error_file.write(resume_inference_line)
+                        if non_fatal_error_count > 0:
+                            non_fatal_error_line = 'Number of non fatal errors: ' + str(non_fatal_error_count) + '. More information in ' + patho_log + '.\n'
+                            patho_error_file.write(non_fatal_error_line)
+                        if warning_count > 0:
+                            warning_line = 'Number of warning: ' + str(warning_count) + '. More information in ' + patho_log + '.\n'
+                            patho_error_file.write(warning_line)
                        gene_number = int(resume_inference_line.split('PGDB contains ')[1].split(' genes')[0])
                        protein_number = int(resume_inference_line.split('genes, ')[1].split(' proteins')[0])
                        pathway_number = int(resume_inference_line.split('proteins, ')[1].split(' base pathways')[0])
                        reaction_number = int(resume_inference_line.split('base pathways, ')[1].split(' reactions')[0])
                        compound_number = int(resume_inference_line.split('reactions, ')[1].split(' compounds')[0])
-                        patho_resume_writer.writerow([species, gene_number, protein_number, pathway_number, reaction_number, compound_number])
+                        patho_resume_writer.writerow([species, gene_number, protein_number, pathway_number, reaction_number, compound_number, non_fatal_error_count, warning_count])
                        passed_inferences.append(species)

        if species not in passed_inferences and species not in failed_inferences:

From 000e48f7499d8ef2c81836dd9869b6e14d8db742 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 7 Jan 2020 14:48:09 +0100
Subject: [PATCH 13/15] Add error message if --nc is used with wrong PWT version (issue #34).
---
 mpwt/utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mpwt/utils.py b/mpwt/utils.py
index e89d61a..8144812 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -384,9 +384,12 @@ def pubmed_citations(activate_citations):
     """
     ptools_init_filepath = find_ptools_path() + '/ptools-init.dat'
     new_ptools_file = ""
+
+    download_pubmed_entries_parameter = None
     with open(ptools_init_filepath, 'r') as ptools_init_file:
         for line in ptools_init_file.read().split('\n'):
             if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
+                download_pubmed_entries_parameter = True
                 if activate_citations:
                     line = line.replace('F', 'T')
                 else:
@@ -396,5 +399,8 @@ def pubmed_citations(activate_citations):
             else:
                 new_ptools_file = new_ptools_file + line

+    if not download_pubmed_entries_parameter:
+        sys.exit('There is no Batch-PathoLogic-Download-Pubmed-Entries parameter in ' + ptools_init_filepath +'. To use --nc/no_download_articles, mpwt needs Pathway Tools 23.5 or higher.')
+
     with open(ptools_init_filepath, 'w') as ptools_init_file:
         ptools_init_file.write(new_ptools_file)

From 53025e4d211a474792f6b0c097df8fb47174d6cb Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Thu, 9 Jan 2020 14:26:02 +0100
Subject: [PATCH 14/15] Uncomment argument in ptools-init.dat file to use them.

---
 mpwt/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mpwt/utils.py b/mpwt/utils.py
index 8144812..825f9d7 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -388,12 +388,14 @@ def pubmed_citations(activate_citations):
     download_pubmed_entries_parameter = None
     with open(ptools_init_filepath, 'r') as ptools_init_file:
         for line in ptools_init_file.read().split('\n'):
-            if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
+            if 'Batch-PathoLogic-Download-Pubmed-Entries?' in line:
+                if '#' in line:
+                    line = line.replace('#', '')
                 download_pubmed_entries_parameter = True
                 if activate_citations:
-                    line = line.replace('F', 'T')
+                    line = line.replace('nil', 'T')
                 else:
-                    line = line.replace('T', 'F')
+                    line = line.replace('T', 'nil')
             if line != '':
                 new_ptools_file = new_ptools_file + line + '\n'
             else:

From 98966970b67c84d2eb7b503317a0160ac9a9088a Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Thu, 9 Jan 2020 15:25:03 +0100
Subject: [PATCH 15/15] Move to 0.5.3.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 5d95990..bd15101 100755
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 setup(name='mpwt',
       description='Multiprocessing for Pathway Tools',
       long_description=long_description,
-      version='0.5.2',
+      version='0.5.3',
       url='https://github.com/AuReMe/mpwt',
       author='A. Belcour',
       author_email='arnaud.belcour@gmail.com',
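The series above renames the citation switch to ``no_download_articles`` and exposes it both on the command line (``--nc``) and through ``multiprocess_pwt()``. A minimal sketch of the resulting call, assuming mpwt 0.5.3, Pathway Tools 23.5 or higher and placeholder paths:

.. code:: python

    import mpwt

    # Run PathoLogic with the Hole Filler and the Operon Predictor, skip the
    # download of PubMed entries (the --nc equivalent) and keep the PathoLogic logs.
    mpwt.multiprocess_pwt(input_folder='path/to/folder/input',
                          output_folder='path/to/folder/output',
                          patho_inference=True,
                          patho_hole_filler=True,
                          patho_operon_predictor=True,
                          no_download_articles=True,
                          patho_log='path/to/folder/log')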
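The same patch also gives ``taxon_file`` a second role: combined with an input folder and without ``--patho``, ``multiprocess_pwt()`` now writes a ``taxon_id.tsv`` file into the input folder, filled by ``read_taxon_id()`` from the taxon IDs found in the GenBank or GFF files (PathoLogic-format inputs get a ``missing`` value). A short sketch of that mode, with a placeholder path:

.. code:: python

    import mpwt

    # Create path/to/folder/input/taxon_id.tsv with two tab-separated columns,
    # 'species' and 'taxon_id'; the call exits if the file already exists.
    mpwt.multiprocess_pwt(input_folder='path/to/folder/input', taxon_file=True)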
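Patch 12 widens the resume table written by ``check_pwt()`` when a log folder is given with two extra columns, ``pwt_non_fatal_error`` and ``pwt_warning``, counted from the ``;;; Error:`` and ``Warning:`` lines of each PathoLogic log. A sketch of reading those columns back; the file name behind ``patho_resume_pathname`` is assumed here to be ``resume_inference.tsv`` inside the ``--log`` folder, so adjust it to the real name:

.. code:: python

    import csv

    # Assumed location of the tab-separated resume table written by check_pwt().
    resume_tsv = 'path/to/folder/log/resume_inference.tsv'

    with open(resume_tsv) as resume_file:
        for row in csv.DictReader(resume_file, delimiter='\t'):
            # 'pwt_non_fatal_error' and 'pwt_warning' are the two new columns.
            print(row['species'], row['pwt_non_fatal_error'], row['pwt_warning'])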