From b6f0960cf89778d66d38668a044f962c6afef089 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 15:04:19 +0100 Subject: [PATCH 01/15] Add draft for turning off loading of citations. --- mpwt/__init__.py | 2 +- mpwt/__main__.py | 5 ++++- mpwt/mpwt_workflow.py | 15 ++++++++++++++- mpwt/utils.py | 16 +++++++++++++++- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/mpwt/__init__.py b/mpwt/__init__.py index 151de1e..08e2354 100755 --- a/mpwt/__init__.py +++ b/mpwt/__init__.py @@ -1,3 +1,3 @@ from mpwt.pwt_wrapper import run_pwt, run_pwt_dat from mpwt.mpwt_workflow import multiprocess_pwt -from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, remove_pgdbs +from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs diff --git a/mpwt/__main__.py b/mpwt/__main__.py index c35fc3a..f819fa7 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -7,7 +7,7 @@ The script takes a folder name as argument. usage: - mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--dat] [--md] [--cpu=INT] [-r] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] + mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--dat] [--md] [--cpu=INT] [-r] [--nc] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] mpwt --dat [-f=DIR] [-o=DIR] [--md] [--cpu=INT] [-v] mpwt -o=DIR [--md] [--cpu=INT] [-v] mpwt --clean [--cpu=INT] [-v] @@ -21,6 +21,7 @@ -o=DIR Output folder path. Will create a output folder in this folder. --patho Will run an inference of Pathologic on the input files. --hf Use with --patho. Run the Hole Filler using Blast. + --nc Turn off loading of Pubmed entries. --dat Will create BioPAX/attribute-value dat files from PGDB. --md Move only the dat files into the output folder. --clean Clean ptools-local folder, before any other operations. @@ -69,6 +70,7 @@ def run_mpwt(): pgdb_list = args['--list'] ignore_error = args['--ignore-error'] taxon_file = args['--taxon-file'] + turn_off_citations = args['--nc'] verbose = args['-v'] topf = args['topf'] @@ -118,6 +120,7 @@ def run_mpwt(): patho_log=patho_log, ignore_error=ignore_error, taxon_file=taxon_file, + turn_off_citations=turn_off_citations, verbose=verbose) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index 10616f1..cdec7a3 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -25,7 +25,8 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None, patho_hole_filler=None, dat_creation=None, dat_extraction=None, size_reduction=None, number_cpu=None, patho_log=None, - ignore_error=None, taxon_file=None, verbose=None): + ignore_error=None, taxon_file=None, turn_off_citations=None, + verbose=None): """ Function managing all the workflow (from the creatin of the input files to the results). Use it when you import mpwt in a script. @@ -72,6 +73,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if taxon_file and not patho_inference: sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.') + #Check if turn_off_citations is used with patho_inference. + if turn_off_citations and not patho_inference: + sys.exit('To use --nc/turn_off_citations, you need to use the --patho/patho_inference argument.') + # Use the number of cpu given by the user or 1 CPU. 
if number_cpu: try: @@ -82,6 +87,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None number_cpu_to_use = 1 mpwt_pool = Pool(processes=number_cpu_to_use) + # Turn off loading of pubmed entries. + if turn_off_citations: + utils.pubmed_citations(activate_citations=False) + # Check input folder and create input files for PathoLogic. if input_folder: run_ids = [folder_id for folder_id in next(os.walk(input_folder))[1]] @@ -198,6 +207,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None mpwt_pool.close() mpwt_pool.join() + # Turn on loading of pubmed entries. + if turn_off_citations: + utils.pubmed_citations(activate_citations=True) + end_time = time.time() times.append(end_time) steps.append('mpwt') diff --git a/mpwt/utils.py b/mpwt/utils.py index 03ca659..b373e58 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -330,4 +330,18 @@ def run_create_pathologic_file(multiprocessing_input_data): element_file.write('//\n\n') - \ No newline at end of file + +def pubmed_citations(activate_citations): + ptools_init_filepath = find_ptools_path() + '/ptools-init.dat' + new_ptools_file = "" + with open(ptools_init_filepath, 'r') as ptools_init_file: + for line in ptools_init_file.read().split('\n'): + if '##download-pubmed-citations' in line: + if activate_citations: + line = line.replace('N', 'Y') + elif activate_citations == False: + line = line.replace('Y', 'N') + new_ptools_file = new_ptools_file + line + '\n' + + with open(ptools_init_filepath, 'w') as ptools_init_file: + ptools_init_file.write(new_ptools_file) From da8a77478b516878c4f5784f5eb4cd01a7aaf315 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 17:29:26 +0100 Subject: [PATCH 02/15] Fix issue with variable name in topf function (issue #30). --- mpwt/utils.py | 111 ++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/mpwt/utils.py b/mpwt/utils.py index b373e58..d42ae70 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -216,7 +216,7 @@ def run_create_pathologic_file(multiprocessing_input_data): if 'taxon:' in src_dbxref_qualifier: taxon_id = src_dbxref_qualifier.replace('taxon:', '') except KeyError: - logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(genbank_folder)) + logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. 
You can find it on the NCBI.'.format(input_path)) if taxon_id: if not os.path.exists(output_folder + '/taxon_id.tsv'): with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: @@ -228,59 +228,59 @@ def run_create_pathologic_file(multiprocessing_input_data): taxon_writer = csv.writer(taxon_id_file, delimiter='\t') taxon_writer.writerow([input_name, taxon_id]) - for record in SeqIO.parse(input_path, 'genbank'): - element_id = record.id - records = [record] - SeqIO.write(records, output_path + '/' + element_id + '.fasta', 'fasta') - with open(output_path + '/' + element_id + '.pf', 'w') as element_file: - element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') - element_file.write(';; ' + element_id + '\n') - element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') - for feature in record.features: - if feature.type == 'CDS': - gene_name = None - gene_id = None - if 'locus_tag' in feature.qualifiers: - gene_id = feature.qualifiers['locus_tag'][0] - if 'gene' in feature.qualifiers: - gene_name = feature.qualifiers['gene'][0] - if not gene_id and not gene_name: - logger.critical('No locus_tag and no gene qualifiers in feature of record: ' + record.id) - pass - if gene_id: - element_file.write('ID\t' + gene_id + '\n') - else: - if gene_name: - element_file.write('ID\t' + gene_name + '\n') + for record in SeqIO.parse(input_path, 'genbank'): + element_id = record.id + records = [record] + SeqIO.write(records, output_path + '/' + element_id + '.fasta', 'fasta') + with open(output_path + '/' + element_id + '.pf', 'w') as element_file: + element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') + element_file.write(';; ' + element_id + '\n') + element_file.write(';;;;;;;;;;;;;;;;;;;;;;;;;\n') + for feature in record.features: + if feature.type == 'CDS': + gene_name = None + gene_id = None + if 'locus_tag' in feature.qualifiers: + gene_id = feature.qualifiers['locus_tag'][0] + if 'gene' in feature.qualifiers: + gene_name = feature.qualifiers['gene'][0] + if not gene_id and not gene_name: + logger.critical('No locus_tag and no gene qualifiers in feature of record: ' + record.id) + pass + if gene_id: + element_file.write('ID\t' + gene_id + '\n') + else: if gene_name: - element_file.write('NAME\t' + gene_name + '\n') - else: - if gene_id: - element_file.write('NAME\t' + gene_id + '\n') - element_file.write('STARTBASE\t' + str(feature.location.start+1) + '\n') - element_file.write('ENDBASE\t' + str(feature.location.end) + '\n') - if 'function' in feature.qualifiers: - for function in feature.qualifiers['function']: - element_file.write('FUNCTION\t' + function + '\n') - if 'EC_number' in feature.qualifiers: - for ec in feature.qualifiers['EC_number']: - element_file.write('EC\t' + ec + '\n') - if 'go_component' in feature.qualifiers: - for go in feature.qualifiers['go_component']: - element_file.write('GO\t' + go + '\n') - if 'go_function' in feature.qualifiers: - for go in feature.qualifiers['go_component']: - element_file.write('GO\t' + go + '\n') - if 'go_process' in feature.qualifiers: - for go in feature.qualifiers['go_component']: - element_file.write('GO\t' + go + '\n') - element_file.write('PRODUCT-TYPE\tP' + '\n') + element_file.write('ID\t' + gene_name + '\n') + if gene_name: + element_file.write('NAME\t' + gene_name + '\n') + else: if gene_id: - element_file.write('PRODUCT-ID\tprot ' + gene_id + '\n') - else: - if gene_name: - element_file.write('PRODUCT-ID\tprot ' + gene_name + '\n') - element_file.write('//\n\n') + element_file.write('NAME\t' + gene_id + '\n') + element_file.write('STARTBASE\t' + 
str(feature.location.start+1) + '\n') + element_file.write('ENDBASE\t' + str(feature.location.end) + '\n') + if 'function' in feature.qualifiers: + for function in feature.qualifiers['function']: + element_file.write('FUNCTION\t' + function + '\n') + if 'EC_number' in feature.qualifiers: + for ec in feature.qualifiers['EC_number']: + element_file.write('EC\t' + ec + '\n') + if 'go_component' in feature.qualifiers: + for go in feature.qualifiers['go_component']: + element_file.write('GO\t' + go + '\n') + if 'go_function' in feature.qualifiers: + for go in feature.qualifiers['go_component']: + element_file.write('GO\t' + go + '\n') + if 'go_process' in feature.qualifiers: + for go in feature.qualifiers['go_component']: + element_file.write('GO\t' + go + '\n') + element_file.write('PRODUCT-TYPE\tP' + '\n') + if gene_id: + element_file.write('PRODUCT-ID\tprot ' + gene_id + '\n') + else: + if gene_name: + element_file.write('PRODUCT-ID\tprot ' + gene_name + '\n') + element_file.write('//\n\n') elif input_path.endswith('.gff'): gff_database = gffutils.create_db(input_path, ':memory:', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True) @@ -339,9 +339,12 @@ def pubmed_citations(activate_citations): if '##download-pubmed-citations' in line: if activate_citations: line = line.replace('N', 'Y') - elif activate_citations == False: + else: line = line.replace('Y', 'N') - new_ptools_file = new_ptools_file + line + '\n' + if line != '': + new_ptools_file = new_ptools_file + line + '\n' + else: + new_ptools_file = new_ptools_file + line with open(ptools_init_filepath, 'w') as ptools_init_file: ptools_init_file.write(new_ptools_file) From 23b0815f4c8659f72e69892dd3a7c629ce5abf6e Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 17:31:17 +0100 Subject: [PATCH 03/15] Add option for operon prediction (issue #33). --- mpwt/__main__.py | 11 +++++++---- mpwt/mpwt_workflow.py | 37 ++++++++++++++++++++++--------------- mpwt/pathologic_input.py | 7 +++++-- mpwt/pwt_wrapper.py | 4 ++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/mpwt/__main__.py b/mpwt/__main__.py index f819fa7..6ef12aa 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -7,7 +7,7 @@ The script takes a folder name as argument. usage: - mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--dat] [--md] [--cpu=INT] [-r] [--nc] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] + mpwt -f=DIR [-o=DIR] [--patho] [--hf] [--op] [--nc] [--dat] [--md] [--cpu=INT] [-r] [-v] [--clean] [--log=FOLDER] [--ignore-error] [--taxon-file] mpwt --dat [-f=DIR] [-o=DIR] [--md] [--cpu=INT] [-v] mpwt -o=DIR [--md] [--cpu=INT] [-v] mpwt --clean [--cpu=INT] [-v] @@ -21,7 +21,8 @@ -o=DIR Output folder path. Will create a output folder in this folder. --patho Will run an inference of Pathologic on the input files. --hf Use with --patho. Run the Hole Filler using Blast. - --nc Turn off loading of Pubmed entries. + --op Use with --patho. Run the Operon predictor of Pathway-Tools. + --nc Use with --patho. Turn off loading of Pubmed entries. --dat Will create BioPAX/attribute-value dat files from PGDB. --md Move only the dat files into the output folder. --clean Clean ptools-local folder, before any other operations. 
@@ -61,6 +62,8 @@ def run_mpwt(): output_folder = args['-o'] patho_inference = args['--patho'] patho_hole_filler = args['--hf'] + patho_operon_predictor = args['--op'] + patho_citations = args['--nc'] dat_creation = args['--dat'] move_dat = args['--md'] size_reduction = args['-r'] @@ -70,7 +73,6 @@ def run_mpwt(): pgdb_list = args['--list'] ignore_error = args['--ignore-error'] taxon_file = args['--taxon-file'] - turn_off_citations = args['--nc'] verbose = args['-v'] topf = args['topf'] @@ -113,6 +115,8 @@ def run_mpwt(): output_folder=output_folder, patho_inference=patho_inference, patho_hole_filler=patho_hole_filler, + patho_operon_predictor=patho_operon_predictor, + patho_citations=patho_citations, dat_creation=dat_creation, dat_extraction=move_dat, size_reduction=size_reduction, @@ -120,7 +124,6 @@ def run_mpwt(): patho_log=patho_log, ignore_error=ignore_error, taxon_file=taxon_file, - turn_off_citations=turn_off_citations, verbose=verbose) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index cdec7a3..d5a6996 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -23,10 +23,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None, - patho_hole_filler=None, dat_creation=None, dat_extraction=None, - size_reduction=None, number_cpu=None, patho_log=None, - ignore_error=None, taxon_file=None, turn_off_citations=None, - verbose=None): + patho_hole_filler=None, patho_operon_predictor=None, patho_citations=None, + dat_creation=None, dat_extraction=None, size_reduction=None, + number_cpu=None, patho_log=None, ignore_error=None, + taxon_file=None, turn_off_citations=None, verbose=None): """ Function managing all the workflow (from the creatin of the input files to the results). Use it when you import mpwt in a script. @@ -73,9 +73,13 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if taxon_file and not patho_inference: sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.') - #Check if turn_off_citations is used with patho_inference. - if turn_off_citations and not patho_inference: - sys.exit('To use --nc/turn_off_citations, you need to use the --patho/patho_inference argument.') + #Check if patho_operon_predictor is used with patho_inference. + if patho_operon_predictor and not patho_inference: + sys.exit('To use --op/patho_operon_predictor, you need to use the --patho/patho_inference argument.') + + #Check if patho_citations is used with patho_inference. + if patho_citations and not patho_inference: + sys.exit('To use --nc/patho_citations, you need to use the --patho/patho_inference argument.') # Use the number of cpu given by the user or 1 CPU. if number_cpu: @@ -88,7 +92,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None mpwt_pool = Pool(processes=number_cpu_to_use) # Turn off loading of pubmed entries. - if turn_off_citations: + if patho_citations: utils.pubmed_citations(activate_citations=False) # Check input folder and create input files for PathoLogic. @@ -104,8 +108,9 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None if run_patho_dat_ids: # Create the list containing all the data used by the multiprocessing call. 
multiprocess_inputs = create_mpwt_input(run_ids=run_patho_dat_ids, input_folder=input_folder, pgdbs_folder_path=pgdbs_folder_path, - patho_hole_filler=patho_hole_filler, dat_extraction=dat_extraction, output_folder=output_folder, - size_reduction=size_reduction, only_dat_creation=None, taxon_file=taxon_file) + patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, + dat_extraction=dat_extraction, output_folder=output_folder, size_reduction=size_reduction, + only_dat_creation=None, taxon_file=taxon_file) logger.info('~~~~~~~~~~Creation of input data from Genbank/GFF/PF~~~~~~~~~~') mpwt_pool.map(pwt_input_files, multiprocess_inputs) @@ -149,8 +154,9 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None dat_run_ids = create_only_dat_lisp(pgdbs_folder_path, tmp_folder) multiprocess_inputs = create_mpwt_input(run_ids=dat_run_ids, input_folder=tmp_folder, pgdbs_folder_path=pgdbs_folder_path, - patho_hole_filler=patho_hole_filler, dat_extraction=dat_extraction, output_folder=output_folder, - size_reduction=size_reduction, only_dat_creation=only_dat_creation, taxon_file=taxon_file) + patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, + dat_extraction=dat_extraction, output_folder=output_folder, size_reduction=size_reduction, + only_dat_creation=only_dat_creation, taxon_file=taxon_file) # Add species that have data in PGDB but are not present in output folder. # Or if ignore_error has been used, select only PathoLogic build that have succeed + species in input with PGDB and not in output. if input_folder: @@ -163,8 +169,9 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None for run_dat_id in run_dat_ids: create_dat_creation_script(run_dat_id, input_folder + "/" + run_dat_id + "/" + "dat_creation.lisp") multiprocess_dat_inputs = create_mpwt_input(run_ids=run_dat_ids, input_folder=input_folder, pgdbs_folder_path=pgdbs_folder_path, - patho_hole_filler=patho_hole_filler, dat_extraction=dat_extraction, output_folder=output_folder, - size_reduction=size_reduction, only_dat_creation=None, taxon_file=taxon_file) + patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, + dat_extraction=dat_extraction, output_folder=output_folder, size_reduction=size_reduction, + only_dat_creation=None, taxon_file=taxon_file) multiprocess_inputs.extend(multiprocess_dat_inputs) # Create BioPAX/attributes-values dat files. @@ -208,7 +215,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None mpwt_pool.join() # Turn on loading of pubmed entries. - if turn_off_citations: + if patho_citations: utils.pubmed_citations(activate_citations=True) end_time = time.time() diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py index 2ad479f..ccca8e5 100755 --- a/mpwt/pathologic_input.py +++ b/mpwt/pathologic_input.py @@ -426,8 +426,10 @@ def pwt_input_files(multiprocess_input): def create_mpwt_input(run_ids, input_folder, pgdbs_folder_path, - patho_hole_filler=None, dat_extraction=None, output_folder=None, - size_reduction=None, only_dat_creation=None, taxon_file=None): + patho_hole_filler=None, patho_operon_predictor=None, + dat_extraction=None, output_folder=None, + size_reduction=None, only_dat_creation=None, + taxon_file=None): """ Create input list for all multiprocess function, containing one lsit for each input subfolder. All arguments are also stored. 
@@ -456,6 +458,7 @@ def create_mpwt_input(run_ids, input_folder, pgdbs_folder_path, multiprocess_input['pgdb_folders'] = pgdb_id_folders multiprocess_input['species_input_folder_path'] = input_folder_path multiprocess_input['patho_hole_filler'] = patho_hole_filler + multiprocess_input['patho_operon_predictor'] = patho_operon_predictor multiprocess_input['dat_extraction'] = dat_extraction multiprocess_input['output_folder'] = output_folder multiprocess_input['size_reduction'] = size_reduction diff --git a/mpwt/pwt_wrapper.py b/mpwt/pwt_wrapper.py index 4485df5..89ee818 100755 --- a/mpwt/pwt_wrapper.py +++ b/mpwt/pwt_wrapper.py @@ -65,6 +65,7 @@ def run_pwt(multiprocess_input): """ species_input_folder_path = multiprocess_input['species_input_folder_path'] patho_hole_filler = multiprocess_input['patho_hole_filler'] + patho_operon_predictor = multiprocess_input['patho_operon_predictor'] cmd_options = ['-no-web-cel-overview', '-no-cel-overview', '-no-patch-download', '-disable-metadata-saving', '-nologfile'] @@ -73,6 +74,9 @@ def run_pwt(multiprocess_input): if patho_hole_filler: cmd_pwt.append('-hole-filler') + if patho_operon_predictor: + cmd_pwt.append('-operon-predictor') + logger.info(' '.join(cmd_pwt)) error_status = None From fddf6c361c5b468c7b1940a15fba2a1592dc7c08 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 17:53:39 +0100 Subject: [PATCH 04/15] Update Readme with Operon Predictor option (issue #33). Update Readme with no loading of PubMed citations (issue #34). --- README.rst | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 475c0f3..60d5c27 100755 --- a/README.rst +++ b/README.rst @@ -262,7 +262,7 @@ mpwt can be used with the command line: .. code:: sh - mpwt -f path/to/folder/input [-o path/to/folder/output] [--patho] [--hf] [--dat] [--md] [--cpu INT] [-r] [--clean] [--log path/to/folder/log] [--ignore-error] [-v] + mpwt -f path/to/folder/input [-o path/to/folder/output] [--patho] [--hf] [--op] [--nc] [--dat] [--md] [--cpu INT] [-r] [--clean] [--log path/to/folder/log] [--ignore-error] [-v] Optional argument are identified by []. 
@@ -279,6 +279,8 @@ mpwt can be used in a python script with an import: output_folder=folder_output, patho_inference=optional_boolean, patho_hole_filler=optional_boolean, + patho_operon_predictor=optional_boolean, + patho_citations=optional_boolean, dat_creation=optional_boolean, dat_extraction=optional_boolean, size_reduction=optional_boolean, @@ -291,13 +293,17 @@ mpwt can be used in a python script with an import: +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | Command line argument | Python argument | description | +=========================+================================================+=========================================================================+ -| -f | input_folder(string: folder pathname) | input folder as described in Input data | +| -f | input_folder(string: folder pathname) | Input folder as described in Input data | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| -o | output_folder(string: folder pathname) | output folder containing PGDB data or dat files (see --dat arguments) | +| -o | output_folder(string: folder pathname) | Output folder containing PGDB data or dat files (see --dat arguments) | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| --patho | patho_inference(boolean) | launch PathoLogic inference on input folder | +| --patho | patho_inference(boolean) | Launch PathoLogic inference on input folder | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| --hf | patho_hole_filler(boolean) | launch PathoLogic Hole Filler with Blast | +| --hf | patho_hole_filler(boolean) | Launch PathoLogic Hole Filler with Blast | ++-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ +| --op | patho_operon_predictor(boolean) | Launch PathoLogic Operon Predictor | ++-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ +| --nc | patho_citations(boolean) | Launch PathoLogic without loading PubMed citations | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | --dat | dat_creation(boolean) | Create BioPAX/attribute-value dat files | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ @@ -350,13 +356,13 @@ Create PGDBs of studied organisms inside ptools-local: mpwt.multiprocess_pwt(input_folder='path/to/folder/input', patho_inference=True) -Create PGDBs of studied organisms inside ptools-local with the Hole-Filler: +Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations: .. .. code:: sh - mpwt -f path/to/folder/input --patho --hf --log path/to/folder/log + mpwt -f path/to/folder/input --patho --hf --op --nc --log path/to/folder/log .. 
code:: python @@ -364,6 +370,8 @@ Create PGDBs of studied organisms inside ptools-local with the Hole-Filler: mpwt.multiprocess_pwt(input_folder='path/to/folder/input', patho_inference=True, patho_hole_filler=True, + patho_operon_predictor=True, + patho_citations=True, patho_log='path/to/folder/log') Create PGDBs of studied organisms inside ptools-local and create dat files: From 4287fd0c0bd0c0950dfa9b466364201638e07466 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Mon, 18 Nov 2019 18:12:05 +0100 Subject: [PATCH 05/15] Fix issue with result files from Hole Filler and Operon Predictor (issue #33). --- mpwt/pwt_wrapper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mpwt/pwt_wrapper.py b/mpwt/pwt_wrapper.py index 89ee818..f9f9467 100755 --- a/mpwt/pwt_wrapper.py +++ b/mpwt/pwt_wrapper.py @@ -194,7 +194,10 @@ def run_move_pgdb(move_data): for pgdb_file in os.listdir(pgdb_tmp_folder_path): pgdb_file_pathname = pgdb_tmp_folder_path + '/' + pgdb_file if '.dat' not in pgdb_file: - os.remove(pgdb_file_pathname) + if os.path.isfile(pgdb_file): + os.remove(pgdb_file_pathname) + elif os.path.isdir(pgdb_file): + shutil.rmtree(pgdb_file_pathname) shutil.make_archive(output_folder + '/' + pgdb_folder_dbname, 'zip', pgdb_tmp_folder_path) shutil.rmtree(pgdb_folder_path) else: @@ -202,4 +205,7 @@ def run_move_pgdb(move_data): if dat_extraction: for pgdb_file in os.listdir(output_species): if '.dat' not in pgdb_file: - os.remove(output_species+'/'+pgdb_file) + if os.path.isfile(output_species+'/'+pgdb_file): + os.remove(output_species+'/'+pgdb_file) + elif os.path.isdir(output_species+'/'+pgdb_file): + shutil.rmtree(output_species+'/'+pgdb_file) From a7c944e2aa73d50d244de23c6ce20156f8057772 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Wed, 20 Nov 2019 15:04:33 +0100 Subject: [PATCH 06/15] Update function comments with new arguments. 
--- mpwt/mpwt_workflow.py | 12 +++++++----- mpwt/pathologic_input.py | 1 + mpwt/utils.py | 19 +++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index d5a6996..0192073 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -34,11 +34,13 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None Args: input_folder (str): pathname to input folder output_folder (str): pathname to output folder - patho_inference (bool): pathologic boolean (True/False) - patho_hole_filler (bool): pathologic hole filler boolean (True/False) - dat_creation (bool): BioPAX/attributes-values files creation boolean (True/False) - dat_extraction (bool): BioPAX/attributes-values files extraction boolean (True/False) - size_reduction (bool): Delete ptools-local data at the end boolean (True/False) + patho_inference (bool): PathoLogic inference (True/False) + patho_hole_filler (bool): PathoLogic hole filler (True/False) + patho_operon_predictor (bool): PathoLogic operon predictor (True/False) + patho_citations (bool): turning off loading of PubMed citations (True/False) + dat_creation (bool): BioPAX/attributes-values files creation (True/False) + dat_extraction (bool): BioPAX/attributes-values files extraction (True/False) + size_reduction (bool): delete ptools-local data at the end (True/False) number_cpu (int): number of CPU used (default=1) patho_log (str): pathname to mpwt log folder verbose (bool): verbose argument diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py index ccca8e5..305a757 100755 --- a/mpwt/pathologic_input.py +++ b/mpwt/pathologic_input.py @@ -439,6 +439,7 @@ def create_mpwt_input(run_ids, input_folder, pgdbs_folder_path, input_folder (str): pathname to input folder pgdbs_folder_path (str): pathname to species PGDB in ptools-local patho_hole_filler (bool): PathoLogic Hole Filler argument + patho_operon_predictor (bool): PathoLogic Operon predictor argument dat_extraction (bool): BioPAX/attribute-values file extraction argument output_folder (str): pathname to output folder size_reduction (bool): ptools-local PGDB deletion after processing argument diff --git a/mpwt/utils.py b/mpwt/utils.py index d42ae70..66522f9 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -168,6 +168,14 @@ def permission_change(folder_pathname): def create_pathologic_file(input_folder, output_folder, number_cpu=None): + """ + Create PathoLogic file from Genbank or GFF files. + + Args: + input_folder (str): pathname to the folder containing Genbanks or GFFs + output_folder (str): pathname to the output folder containing the PathoLogic files + number_cpu (str): number of CPU + """ if number_cpu: number_cpu_to_use = int(number_cpu) else: @@ -199,6 +207,12 @@ def create_pathologic_file(input_folder, output_folder, number_cpu=None): def run_create_pathologic_file(multiprocessing_input_data): + """ + Create PathoLogic files from a Genbank or a GFF file. + + Args: + multiprocess_input (dictionary): contains multiprocess input (input folder, output_path, output folder and input_name) + """ input_path = multiprocessing_input_data['input_path'] output_folder = multiprocessing_input_data['output_folder'] output_path = multiprocessing_input_data['output_path'] @@ -332,6 +346,11 @@ def run_create_pathologic_file(multiprocessing_input_data): def pubmed_citations(activate_citations): + """ + Activate or deactivate loading of PubMed citations. 
+ + TODO: update this function with the argument from the new version of Pathway Tools + """ ptools_init_filepath = find_ptools_path() + '/ptools-init.dat' new_ptools_file = "" with open(ptools_init_filepath, 'r') as ptools_init_file: From 24140ee7d5f4d5dc0343b20e89df93f4c2ef12df Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Fri, 22 Nov 2019 16:42:48 +0100 Subject: [PATCH 07/15] Fix issue with topf and PF as input (issue #30). Add test for topf (issue #30). --- mpwt/__init__.py | 2 +- mpwt/__main__.py | 2 +- mpwt/utils.py | 77 +++++++++++++------ .../fatty_acid_beta_oxydation_I_gff.gff | 32 ++++---- ...pwt_test.py => test_mpwt_pathway_tools.py} | 5 +- 5 files changed, 76 insertions(+), 42 deletions(-) rename test/{mpwt_test.py => test_mpwt_pathway_tools.py} (93%) diff --git a/mpwt/__init__.py b/mpwt/__init__.py index 08e2354..85d1f35 100755 --- a/mpwt/__init__.py +++ b/mpwt/__init__.py @@ -1,3 +1,3 @@ from mpwt.pwt_wrapper import run_pwt, run_pwt_dat from mpwt.mpwt_workflow import multiprocess_pwt -from mpwt.utils import cleaning, cleaning_input, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs +from mpwt.utils import cleaning, cleaning_input, create_pathologic_file, find_ptools_path, list_pgdb, pubmed_citations, remove_pgdbs diff --git a/mpwt/__main__.py b/mpwt/__main__.py index 6ef12aa..bcd3728 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -34,7 +34,7 @@ --ignore-error Ignore errors (PathoLogic and dat creation) and continue for successful builds. --taxon-file For the use of the taxon_id.tsv file to find the taxon ID. -v Verbose. - topf Will convert Genbank file into PathoLogic Format file. + topf Will convert Genbank and/or GFF files into PathoLogic Format file. """ diff --git a/mpwt/utils.py b/mpwt/utils.py index 66522f9..7e4b682 100755 --- a/mpwt/utils.py +++ b/mpwt/utils.py @@ -185,20 +185,41 @@ def create_pathologic_file(input_folder, output_folder, number_cpu=None): mpwt_pool = Pool(processes=number_cpu_to_use) - for input_name in os.listdir(input_folder): + input_names = os.listdir(input_folder) + + if 'taxon_id.tsv' in input_names: + taxon_ids = {} + input_names.remove('taxon_id.tsv') + with open(input_folder + '/taxon_id.tsv') as taxon_file: + for row in csv.reader(taxon_file, delimiter='\t'): + taxon_ids[row[0]] = row[1] + else: + taxon_ids = None + + for input_name in input_names: input_path_gbk = input_folder + '/' + input_name + '/' + input_name + '.gbk' input_path_gff = input_folder + '/' + input_name + '/' + input_name + '.gff' if os.path.exists(input_path_gbk): input_path = input_path_gbk elif os.path.exists(input_path_gff): input_path = input_path_gff + elif all([True for species_file in os.listdir(input_folder + '/' + input_name + '/') if '.pf' in species_file or '.fasta' in species_file]): + input_path = input_folder + '/' + input_name + '/' else: sys.exit('No .gff or .gbk file in ' + input_folder + '/' + input_name) + output_path = output_folder + '/' + input_name - if not os.path.exists(output_path): - os.makedirs(output_path) - multiprocessing_input_data.append({'input_path': input_path, 'output_path': output_path, - 'output_folder': output_folder, 'input_name': input_name}) + + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + multiprocessing_dict = {'input_path': input_path, 'output_path': output_path, + 'output_folder': output_folder, 'input_name': input_name} + if taxon_ids: + if input_name in taxon_ids: + multiprocessing_dict['taxon_id'] = taxon_ids[input_name] + + 
multiprocessing_input_data.append(multiprocessing_dict) mpwt_pool.map(run_create_pathologic_file, multiprocessing_input_data) @@ -206,6 +227,18 @@ def create_pathologic_file(input_folder, output_folder, number_cpu=None): mpwt_pool.join() +def write_taxon_id_file(input_name, taxon_id, output_folder): + if not os.path.exists(output_folder + '/taxon_id.tsv'): + with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: + taxon_writer = csv.writer(taxon_id_file, delimiter='\t') + taxon_writer.writerow(['species', 'taxon_id']) + taxon_writer.writerow([input_name, taxon_id]) + else: + with open(output_folder + '/taxon_id.tsv', 'a') as taxon_id_file: + taxon_writer = csv.writer(taxon_id_file, delimiter='\t') + taxon_writer.writerow([input_name, taxon_id]) + + def run_create_pathologic_file(multiprocessing_input_data): """ Create PathoLogic files from a Genbank or a GFF file. @@ -220,6 +253,10 @@ def run_create_pathologic_file(multiprocessing_input_data): taxon_id = None # Add taxon ID in taxon_id.tsv if available. if input_path.endswith('.gbk'): + + if not os.path.exists(output_path): + os.makedirs(output_path) + with open(input_path, "r") as gbk: first_seq_record = next(SeqIO.parse(gbk, "genbank")) src_features = [feature for feature in first_seq_record.features if feature.type == "source"] @@ -232,15 +269,7 @@ def run_create_pathologic_file(multiprocessing_input_data): except KeyError: logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(input_path)) if taxon_id: - if not os.path.exists(output_folder + '/taxon_id.tsv'): - with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow(['species', 'taxon_id']) - taxon_writer.writerow([input_name, taxon_id]) - else: - with open(output_folder + '/taxon_id.tsv', 'a') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow([input_name, taxon_id]) + write_taxon_id_file(input_name, taxon_id, output_folder) for record in SeqIO.parse(input_path, 'genbank'): element_id = record.id @@ -297,6 +326,10 @@ def run_create_pathologic_file(multiprocessing_input_data): element_file.write('//\n\n') elif input_path.endswith('.gff'): + + if not os.path.exists(output_path): + os.makedirs(output_path) + gff_database = gffutils.create_db(input_path, ':memory:', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True) regions = list(set([region.chrom for region in gff_database.features_of_type('region')])) try: @@ -308,15 +341,8 @@ def run_create_pathologic_file(multiprocessing_input_data): if 'taxon' in dbxref: taxon_id = dbxref.replace('taxon:', '') if taxon_id: - if not os.path.exists(output_folder + '/taxon_id.tsv'): - with open(output_folder + '/taxon_id.tsv', 'w') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow(['species', 'taxon_id']) - taxon_writer.writerow([input_name, taxon_id]) - else: - with open(output_folder + '/taxon_id.tsv', 'a') as taxon_id_file: - taxon_writer = csv.writer(taxon_id_file, delimiter='\t') - taxon_writer.writerow([input_name, taxon_id]) + write_taxon_id_file(input_name, taxon_id, output_folder) + for record in SeqIO.parse(input_path.replace('.gff', '.fasta'), 'fasta'): output_fasta = output_path + '/' + record.id + '.fasta' SeqIO.write(record, output_fasta, 'fasta') @@ -344,6 +370,11 @@ def 
run_create_pathologic_file(multiprocessing_input_data): element_file.write('//\n\n') + elif all([True for species_file in os.listdir(input_path) if '.pf' in species_file or '.fasta' in species_file]): + taxon_id = multiprocessing_input_data['taxon_id'] + write_taxon_id_file(input_name, taxon_id, output_folder) + shutil.copytree(input_path, output_path) + def pubmed_citations(activate_citations): """ diff --git a/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff b/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff index 03de3a4..6c6bc6a 100755 --- a/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff +++ b/test/test/fatty_acid_beta_oxydation_I_gff/fatty_acid_beta_oxydation_I_gff.gff @@ -3,20 +3,20 @@ #!processor NCBI annotwriter #!genome-build ASM584v2 #!genome-build-accession NCBI_Assembly:GCF_000005845.2 -##sequence-region NC_000913.3 1 12642 +##sequence-region NC_000913_3 1 12642 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=511145 -NC_000913.3 RefSeq region 1 12642 . + . ID=id0;Dbxref=taxon:511145;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=K-12;substrain=MG1655 -NC_000913.3 RefSeq gene 1 2445 . - . ID=gene226;Dbxref=ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=fadE;gbkey=Gene;gene=fadE;gene_biotype=protein_coding;gene_synonym=ECK0222,yafH;locus_tag=b0221 -NC_000913.3 RefSeq CDS 1 2445 . - 0 ID=cds216;Parent=gene226;Dbxref=UniProtKB/Swiss-Prot:Q47146,Genbank:NP_414756.2,ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=NP_414756.2;gbkey=CDS;gene=fadE;orig_transcript_id=gnl|b0221|mrna.b0221;product=acyl-CoA dehydrogenase;protein_id=NP_414756.2;transl_table=11 -NC_000913.3 RefSeq gene 2446 4146 . + . ID=gene1781;Dbxref=ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=fadK;gbkey=Gene;gene=fadK;gene_biotype=protein_coding;gene_synonym=ECK1699,ydiD;locus_tag=b1701 -NC_000913.3 RefSeq CDS 2446 4146 . + 0 ID=cds1721;Parent=gene1781;Dbxref=UniProtKB/Swiss-Prot:P38135,Genbank:NP_416216.5,ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=NP_416216.5;gbkey=CDS;gene=fadK;orig_transcript_id=gnl|b1701|mrna.b1701;product=short chain acyl-CoA synthetase;protein_id=NP_416216.5;transl_table=11 -NC_000913.3 RefSeq gene 4147 5832 . - . ID=gene1887;Dbxref=ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=fadD;gbkey=Gene;gene=fadD;gene_biotype=protein_coding;gene_synonym=ECK1803,oldD;locus_tag=b1805 -NC_000913.3 RefSeq CDS 4147 5832 . - 0 ID=cds1827;Parent=gene1887;Dbxref=UniProtKB/Swiss-Prot:P69451,Genbank:NP_416319.1,ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=NP_416319.1;gbkey=CDS;gene=fadD;orig_transcript_id=gnl|b1805|mrna.b1805;product=fatty acyl-CoA synthetase;protein_id=NP_416319.1;transl_table=11 -NC_000913.3 RefSeq gene 5833 7977 . - . ID=gene2441;Dbxref=ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=fadJ;gbkey=Gene;gene=fadJ;gene_biotype=protein_coding;gene_synonym=ECK2335,yfcX;locus_tag=b2341 -NC_000913.3 RefSeq CDS 5833 7977 . - 0 ID=cds2356;Parent=gene2441;Dbxref=UniProtKB/Swiss-Prot:P77399,Genbank:NP_416843.1,ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=NP_416843.1;gbkey=CDS;gene=fadJ;orig_transcript_id=gnl|b2341|mrna.b2341;product=3-hydroxyacyl-CoA dehydrogenase FadJ;protein_id=NP_416843.1;transl_table=11 -NC_000913.3 RefSeq gene 7978 9288 . - . 
ID=gene2442;Dbxref=ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=fadI;gbkey=Gene;gene=fadI;gene_biotype=protein_coding;gene_synonym=ECK2336,yfcY;locus_tag=b2342 -NC_000913.3 RefSeq CDS 7978 9288 . - 0 ID=cds2357;Parent=gene2442;Dbxref=UniProtKB/Swiss-Prot:P76503,Genbank:NP_416844.1,ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=NP_416844.1;gbkey=CDS;gene=fadI;orig_transcript_id=gnl|b2342|mrna.b2342;product=3-ketoacyl-CoA thiolase FadI;protein_id=NP_416844.1;transl_table=11 -NC_000913.3 RefSeq gene 9289 10452 . - . ID=gene3987;Dbxref=ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=fadA;gbkey=Gene;gene=fadA;gene_biotype=protein_coding;gene_synonym=ECK3837,oldA;locus_tag=b3845 -NC_000913.3 RefSeq CDS 9289 10452 . - 0 ID=cds3816;Parent=gene3987;Dbxref=UniProtKB/Swiss-Prot:P21151,Genbank:YP_026272.1,ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=YP_026272.1;gbkey=CDS;gene=fadA;orig_transcript_id=gnl|b3845|mrna.b3845;product=3-ketoacyl-CoA thiolase;protein_id=YP_026272.1;transl_table=11 -NC_000913.3 RefSeq gene 10453 12642 . - . ID=gene3988;Dbxref=ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=fadB;gbkey=Gene;gene=fadB;gene_biotype=protein_coding;gene_synonym=ECK3838,oldB;locus_tag=b3846 -NC_000913.3 RefSeq CDS 10453 12642 . - 0 ID=cds3817;Parent=gene3988;Dbxref=UniProtKB/Swiss-Prot:P21177,Genbank:NP_418288.1,ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=NP_418288.1;gbkey=CDS;gene=fadB;orig_transcript_id=gnl|b3846|mrna.b3846;product=dodecenoyl-CoA delta-isomerase%2C enoyl-CoA hydratase%2C 3-hydroxybutyryl-CoA epimerase%2C 3-hydroxyacyl-CoA dehydrogenase;protein_id=NP_418288.1;transl_table=11 +NC_000913_3 RefSeq region 1 12642 . + . ID=id0;Dbxref=taxon:511145;Is_circular=true;Name=ANONYMOUS;gbkey=Src;genome=chromosome;mol_type=genomic DNA;strain=K-12;substrain=MG1655 +NC_000913_3 RefSeq gene 1 2445 . - . ID=gene226;Dbxref=ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=fadE;gbkey=Gene;gene=fadE;gene_biotype=protein_coding;gene_synonym=ECK0222,yafH;locus_tag=b0221 +NC_000913_3 RefSeq CDS 1 2445 . - 0 ID=cds216;Parent=gene226;Dbxref=UniProtKB/Swiss-Prot:Q47146,Genbank:NP_414756.2,ASAP:ABE-0000743,ECOCYC:G6105,EcoGene:EG13145,GeneID:949007;Name=NP_414756.2;gbkey=CDS;gene=fadE;orig_transcript_id=gnl|b0221|mrna.b0221;product=acyl-CoA dehydrogenase;protein_id=NP_414756.2;transl_table=11 +NC_000913_3 RefSeq gene 2446 4146 . + . ID=gene1781;Dbxref=ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=fadK;gbkey=Gene;gene=fadK;gene_biotype=protein_coding;gene_synonym=ECK1699,ydiD;locus_tag=b1701 +NC_000913_3 RefSeq CDS 2446 4146 . + 0 ID=cds1721;Parent=gene1781;Dbxref=UniProtKB/Swiss-Prot:P38135,Genbank:NP_416216.5,ASAP:ABE-0005676,ECOCYC:EG12357,EcoGene:EG12357,GeneID:946213;Name=NP_416216.5;gbkey=CDS;gene=fadK;orig_transcript_id=gnl|b1701|mrna.b1701;product=short chain acyl-CoA synthetase;protein_id=NP_416216.5;transl_table=11 +NC_000913_3 RefSeq gene 4147 5832 . - . ID=gene1887;Dbxref=ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=fadD;gbkey=Gene;gene=fadD;gene_biotype=protein_coding;gene_synonym=ECK1803,oldD;locus_tag=b1805 +NC_000913_3 RefSeq CDS 4147 5832 . 
- 0 ID=cds1827;Parent=gene1887;Dbxref=UniProtKB/Swiss-Prot:P69451,Genbank:NP_416319.1,ASAP:ABE-0006005,ECOCYC:EG11530,EcoGene:EG11530,GeneID:946327;Name=NP_416319.1;gbkey=CDS;gene=fadD;orig_transcript_id=gnl|b1805|mrna.b1805;product=fatty acyl-CoA synthetase;protein_id=NP_416319.1;transl_table=11 +NC_000913_3 RefSeq gene 5833 7977 . - . ID=gene2441;Dbxref=ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=fadJ;gbkey=Gene;gene=fadJ;gene_biotype=protein_coding;gene_synonym=ECK2335,yfcX;locus_tag=b2341 +NC_000913_3 RefSeq CDS 5833 7977 . - 0 ID=cds2356;Parent=gene2441;Dbxref=UniProtKB/Swiss-Prot:P77399,Genbank:NP_416843.1,ASAP:ABE-0007723,ECOCYC:G7212,EcoGene:EG14127,GeneID:949097;Name=NP_416843.1;gbkey=CDS;gene=fadJ;orig_transcript_id=gnl|b2341|mrna.b2341;product=3-hydroxyacyl-CoA dehydrogenase FadJ;protein_id=NP_416843.1;transl_table=11 +NC_000913_3 RefSeq gene 7978 9288 . - . ID=gene2442;Dbxref=ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=fadI;gbkey=Gene;gene=fadI;gene_biotype=protein_coding;gene_synonym=ECK2336,yfcY;locus_tag=b2342 +NC_000913_3 RefSeq CDS 7978 9288 . - 0 ID=cds2357;Parent=gene2442;Dbxref=UniProtKB/Swiss-Prot:P76503,Genbank:NP_416844.1,ASAP:ABE-0007725,ECOCYC:G7213,EcoGene:EG14128,GeneID:948823;Name=NP_416844.1;gbkey=CDS;gene=fadI;orig_transcript_id=gnl|b2342|mrna.b2342;product=3-ketoacyl-CoA thiolase FadI;protein_id=NP_416844.1;transl_table=11 +NC_000913_3 RefSeq gene 9289 10452 . - . ID=gene3987;Dbxref=ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=fadA;gbkey=Gene;gene=fadA;gene_biotype=protein_coding;gene_synonym=ECK3837,oldA;locus_tag=b3845 +NC_000913_3 RefSeq CDS 9289 10452 . - 0 ID=cds3816;Parent=gene3987;Dbxref=UniProtKB/Swiss-Prot:P21151,Genbank:YP_026272.1,ASAP:ABE-0012562,ECOCYC:EG10278,EcoGene:EG10278,GeneID:948324;Name=YP_026272.1;gbkey=CDS;gene=fadA;orig_transcript_id=gnl|b3845|mrna.b3845;product=3-ketoacyl-CoA thiolase;protein_id=YP_026272.1;transl_table=11 +NC_000913_3 RefSeq gene 10453 12642 . - . ID=gene3988;Dbxref=ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=fadB;gbkey=Gene;gene=fadB;gene_biotype=protein_coding;gene_synonym=ECK3838,oldB;locus_tag=b3846 +NC_000913_3 RefSeq CDS 10453 12642 . 
- 0 ID=cds3817;Parent=gene3988;Dbxref=UniProtKB/Swiss-Prot:P21177,Genbank:NP_418288.1,ASAP:ABE-0012564,ECOCYC:EG10279,EcoGene:EG10279,GeneID:948336;Name=NP_418288.1;gbkey=CDS;gene=fadB;orig_transcript_id=gnl|b3846|mrna.b3846;product=dodecenoyl-CoA delta-isomerase%2C enoyl-CoA hydratase%2C 3-hydroxybutyryl-CoA epimerase%2C 3-hydroxyacyl-CoA dehydrogenase;protein_id=NP_418288.1;transl_table=11 diff --git a/test/mpwt_test.py b/test/test_mpwt_pathway_tools.py similarity index 93% rename from test/mpwt_test.py rename to test/test_mpwt_pathway_tools.py index 845a5cf..234d2de 100755 --- a/test/mpwt_test.py +++ b/test/test_mpwt_pathway_tools.py @@ -36,7 +36,9 @@ def test_multiprocess_pwt_import(): """ mpwt.remove_pgdbs('fatty_acid_beta_oxydation_icyc,fatty_acid_beta_oxydation_i_gffcyc,fatty_acid_beta_oxydation_i_pfcyc') mpwt.cleaning_input('test') - mpwt.multiprocess_pwt('test', 'test_output', patho_inference=True, dat_creation=True, dat_extraction=True, size_reduction=False, verbose=True) + + mpwt.create_pathologic_file('test', 'test_pf') + mpwt.multiprocess_pwt('test_pf', 'test_output', patho_inference=True, dat_creation=True, dat_extraction=True, size_reduction=False, verbose=True) pathway_fabo_pathname = "test_output/fatty_acid_beta_oxydation_I_gff/pathways.dat" expected_tca_reactions = reaction_extraction(pathway_fabo_pathname) @@ -51,6 +53,7 @@ def test_multiprocess_pwt_import(): assert set(fabo_reactions()).issubset(set(expected_pf_fabo_reactions)) mpwt.cleaning_input('test') + shutil.rmtree('test_pf') shutil.rmtree('test_output') shutil.rmtree('__pycache__') From 1f0bbd46bb86c5f56228ab87a1f426b5fa431307 Mon Sep 17 00:00:00 2001 From: Arnaud Belcour Date: Fri, 22 Nov 2019 16:44:01 +0100 Subject: [PATCH 08/15] Add topf in Readme (issue #30). --- README.rst | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.rst b/README.rst index 60d5c27..f9c2b53 100755 --- a/README.rst +++ b/README.rst @@ -326,6 +326,21 @@ mpwt can be used in a python script with an import: | -v | verbose(boolean) | Print some information about the processing of mpwt | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ +There is also another argument: + +.. code:: sh + + mpwt topf -f input_folder -o output_folder -c cpu_number + +.. code:: python + + import mpwt + mpwt.create_pathologic_file(input_folder, output_folder, cpu_number) + +This argument reads the input data inside the input folder. Then it converts Genbank and GFF files into PathoLogic Format files. And if there is already PathoLogic files it copies them. + +It can be used to avoid issues with parsing Genbank and GFF files. But it is an early Work in Progress. + Examples ~~~~~~~~ @@ -356,6 +371,21 @@ Create PGDBs of studied organisms inside ptools-local: mpwt.multiprocess_pwt(input_folder='path/to/folder/input', patho_inference=True) +Convert Genbank and GFF files into PathoLogic files then create PGDBs of studied organisms inside ptools-local: + +.. + + .. code:: sh + + mpwt topf -f path/to/folder/input -o path/to/folder/pf + mpwt -f path/to/folder/pf --patho + + .. code:: python + + import mpwt + mpwt.create_pathologic_file(input_folder='path/to/folder/input', output_folder='path/to/folder/pf') + mpwt.multiprocess_pwt(input_folder='path/to/folder/pf', patho_inference=True) + Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations: .. 
From 12c94ce988352a61e4c066582922a3c1cb2efe8f Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 26 Nov 2019 13:15:12 +0100
Subject: [PATCH 09/15] Fix issue with PathoLogic file creation when using GFF (issue #30). Fix link in Readme.

---
 README.rst    | 2 +-
 mpwt/utils.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index f9c2b53..8c8e8f8 100755
--- a/README.rst
+++ b/README.rst
@@ -151,7 +151,7 @@ PF file example:
     INTRON START1-STOP1
 //
 
-Look at the `Pathologic format `__ for more informations.
+Look at the `Pathologic format `__ for more information.
 
 You have to provide one nucleotide sequence for each pathologic containing one scaffold/contig.
 
diff --git a/mpwt/utils.py b/mpwt/utils.py
index 7e4b682..864cec2 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -367,8 +367,7 @@ def run_create_pathologic_file(multiprocessing_input_data):
                     if 'ec_number' in child.attributes:
                         for ec in child.attributes['ec_number']:
                             element_file.write('EC\t' + ec + '\n')
-
-                element_file.write('//\n\n')
+                    element_file.write('//\n\n')
 
     elif all([True for species_file in os.listdir(input_path) if '.pf' in species_file or '.fasta' in species_file]):
         taxon_id = multiprocessing_input_data['taxon_id']
         write_taxon_id_file(input_name, taxon_id, output_folder)
         shutil.copytree(input_path, output_path)

From d7154f2c08f48f788fd8dd2567740f4d743b66b6 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 26 Nov 2019 14:57:00 +0100
Subject: [PATCH 10/15] Fix issue with GO terms in topf (issue #30).

---
 mpwt/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mpwt/utils.py b/mpwt/utils.py
index 864cec2..3c8b5c1 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -312,10 +312,10 @@ def run_create_pathologic_file(multiprocessing_input_data):
                         for go in feature.qualifiers['go_component']:
                             element_file.write('GO\t' + go + '\n')
                     if 'go_function' in feature.qualifiers:
-                        for go in feature.qualifiers['go_component']:
+                        for go in feature.qualifiers['go_function']:
                             element_file.write('GO\t' + go + '\n')
                     if 'go_process' in feature.qualifiers:
-                        for go in feature.qualifiers['go_component']:
+                        for go in feature.qualifiers['go_process']:
                             element_file.write('GO\t' + go + '\n')
                     element_file.write('PRODUCT-TYPE\tP' + '\n')
                     if gene_id:
                         element_file.write('PRODUCT-ID\tprot ' + gene_id + '\n')

From 2d9326746a4f75424a990422b8050ec9c98098f4 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Mon, 6 Jan 2020 16:48:23 +0100
Subject: [PATCH 11/15] Add option to create taxon_file.tsv (issue #35). Update function modifying ptools-init.dat (issue #34).
--- README.rst | 8 +++---- mpwt/__main__.py | 4 ++-- mpwt/mpwt_workflow.py | 36 ++++++++++++++++++++--------- mpwt/pathologic_input.py | 49 ++++++++++++++++++++++++++++++++++++++++ mpwt/results_check.py | 2 +- mpwt/utils.py | 9 ++++---- 6 files changed, 86 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index 8c8e8f8..f4a5f2b 100755 --- a/README.rst +++ b/README.rst @@ -280,7 +280,7 @@ mpwt can be used in a python script with an import: patho_inference=optional_boolean, patho_hole_filler=optional_boolean, patho_operon_predictor=optional_boolean, - patho_citations=optional_boolean, + no_download_articles=optional_boolean, dat_creation=optional_boolean, dat_extraction=optional_boolean, size_reduction=optional_boolean, @@ -303,7 +303,7 @@ mpwt can be used in a python script with an import: +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | --op | patho_operon_predictor(boolean) | Launch PathoLogic Operon Predictor | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ -| --nc | patho_citations(boolean) | Launch PathoLogic without loading PubMed citations | +| --nc | no_download_articles(boolean) | Launch PathoLogic without loading PubMed citations | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ | --dat | dat_creation(boolean) | Create BioPAX/attribute-value dat files | +-------------------------+------------------------------------------------+-------------------------------------------------------------------------+ @@ -386,7 +386,7 @@ Convert Genbank and GFF files into PathoLogic files then create PGDBs of studied mpwt.create_pathologic_file(input_folder='path/to/folder/input', output_folder='path/to/folder/pf') mpwt.multiprocess_pwt(input_folder='path/to/folder/pf', patho_inference=True) -Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations: +Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations (need Pathway Tools 23.5 or higher): .. 
@@ -401,7 +401,7 @@ Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon P patho_inference=True, patho_hole_filler=True, patho_operon_predictor=True, - patho_citations=True, + no_download_articles=True, patho_log='path/to/folder/log') Create PGDBs of studied organisms inside ptools-local and create dat files: diff --git a/mpwt/__main__.py b/mpwt/__main__.py index bcd3728..44f4187 100755 --- a/mpwt/__main__.py +++ b/mpwt/__main__.py @@ -63,7 +63,7 @@ def run_mpwt(): patho_inference = args['--patho'] patho_hole_filler = args['--hf'] patho_operon_predictor = args['--op'] - patho_citations = args['--nc'] + no_download_articles = args['--nc'] dat_creation = args['--dat'] move_dat = args['--md'] size_reduction = args['-r'] @@ -116,7 +116,7 @@ def run_mpwt(): patho_inference=patho_inference, patho_hole_filler=patho_hole_filler, patho_operon_predictor=patho_operon_predictor, - patho_citations=patho_citations, + no_download_articles=no_download_articles, dat_creation=dat_creation, dat_extraction=move_dat, size_reduction=size_reduction, diff --git a/mpwt/mpwt_workflow.py b/mpwt/mpwt_workflow.py index 0192073..d25e886 100755 --- a/mpwt/mpwt_workflow.py +++ b/mpwt/mpwt_workflow.py @@ -5,6 +5,7 @@ -check the results (results_check) """ +import csv import logging import os import shutil @@ -14,7 +15,7 @@ from mpwt import utils from mpwt.pwt_wrapper import run_pwt, run_pwt_dat, run_move_pgdb from mpwt.results_check import check_dat, check_pwt, permission_change -from mpwt.pathologic_input import check_input_and_existing_pgdb, create_mpwt_input, pwt_input_files, create_only_dat_lisp, create_dat_creation_script +from mpwt.pathologic_input import check_input_and_existing_pgdb, create_mpwt_input, pwt_input_files, create_only_dat_lisp, create_dat_creation_script, read_taxon_id from multiprocessing import Pool logging.basicConfig(format='%(message)s', level=logging.CRITICAL) @@ -23,10 +24,10 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None, - patho_hole_filler=None, patho_operon_predictor=None, patho_citations=None, + patho_hole_filler=None, patho_operon_predictor=None, no_download_articles=None, dat_creation=None, dat_extraction=None, size_reduction=None, number_cpu=None, patho_log=None, ignore_error=None, - taxon_file=None, turn_off_citations=None, verbose=None): + taxon_file=None, verbose=None): """ Function managing all the workflow (from the creatin of the input files to the results). Use it when you import mpwt in a script. @@ -37,7 +38,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None patho_inference (bool): PathoLogic inference (True/False) patho_hole_filler (bool): PathoLogic hole filler (True/False) patho_operon_predictor (bool): PathoLogic operon predictor (True/False) - patho_citations (bool): turning off loading of PubMed citations (True/False) + no_download_articles (bool): turning off loading of PubMed citations (True/False) dat_creation (bool): BioPAX/attributes-values files creation (True/False) dat_extraction (bool): BioPAX/attributes-values files extraction (True/False) size_reduction (bool): delete ptools-local data at the end (True/False) @@ -72,16 +73,16 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None sys.exit('To use --ignore-error/ignore_error, you need to use the --patho/patho_inference argument.') # Check if taxon_file is used with patho_inference. 
-    if taxon_file and not patho_inference:
-        sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.')
+    if (taxon_file and not patho_inference) and (taxon_file and not input_folder):
+        sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument. Or you can use it with the -f argument to create the taxon file from data.')

     #Check if patho_operon_predictor is used with patho_inference.
     if patho_operon_predictor and not patho_inference:
         sys.exit('To use --op/patho_operon_predictor, you need to use the --patho/patho_inference argument.')

-    #Check if patho_citations is used with patho_inference.
-    if patho_citations and not patho_inference:
-        sys.exit('To use --nc/patho_citations, you need to use the --patho/patho_inference argument.')
+    #Check if no_download_articles is used with patho_inference.
+    if no_download_articles and not patho_inference:
+        sys.exit('To use --nc/no_download_articles, you need to use the --patho/patho_inference argument.')

     # Use the number of cpu given by the user or 1 CPU.
     if number_cpu:
@@ -93,8 +94,21 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
         number_cpu_to_use = 1
     mpwt_pool = Pool(processes=number_cpu_to_use)

+    # Create taxon file in the input folder.
+    if taxon_file and input_folder and not patho_inference:
+        taxon_file_pathname = input_folder + '/taxon_id.tsv'
+        if os.path.exists(taxon_file_pathname):
+            sys.exit('taxon ID file (' + taxon_file_pathname + ') already exists.')
+        else:
+            taxon_ids = read_taxon_id(input_folder)
+            with open(taxon_file_pathname, 'w') as taxon_id_file:
+                taxon_id_writer = csv.writer(taxon_id_file, delimiter='\t')
+                taxon_id_writer.writerow(['species', 'taxon_id'])
+                for species, taxon_id in taxon_ids.items():
+                    taxon_id_writer.writerow([species, taxon_id])
+
     # Turn off loading of pubmed entries.
-    if patho_citations:
+    if no_download_articles:
         utils.pubmed_citations(activate_citations=False)

     # Check input folder and create input files for PathoLogic.
@@ -217,7 +231,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
     mpwt_pool.join()

     # Turn on loading of pubmed entries.
-    if patho_citations:
+    if no_download_articles:
         utils.pubmed_citations(activate_citations=True)

     end_time = time.time()
diff --git a/mpwt/pathologic_input.py b/mpwt/pathologic_input.py
index 305a757..43c7493 100755
--- a/mpwt/pathologic_input.py
+++ b/mpwt/pathologic_input.py
@@ -394,6 +394,55 @@ def create_dats_and_lisp(run_folder, taxon_file):

     return all([os.path.isfile(organism_dat), os.path.isfile(genetic_dat), check_lisp_file])

+
+def read_taxon_id(run_folder):
+    taxon_ids = {}
+
+    for input_folder in os.listdir(run_folder):
+        for input_file in os.listdir(run_folder + '/' + input_folder):
+            if '.gbk' in input_file:
+                gbk_pathname = run_folder + '/' + input_folder + '/' + input_file
+                # Take the species name and the taxon id from the genbank file.
+                with open(gbk_pathname, "r") as gbk:
+                    # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
+                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))
+                    # Take the source feature of the first record.
+                    # This feature contains the taxon ID in the db_xref qualifier.
+                    src_features = [feature for feature in first_seq_record.features if feature.type == "source"]
+                    for src_feature in src_features:
+                        try:
+                            src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
+                            for src_dbxref_qualifier in src_dbxref_qualifiers:
+                                if 'taxon:' in src_dbxref_qualifier:
+                                    taxon_id = src_dbxref_qualifier.replace('taxon:', '')
+                        except KeyError:
+                            logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(gbk_pathname))
+
+            elif '.gff' in input_file:
+                gff_pathname = run_folder + '/' + input_folder + '/' + input_file
+
+                # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
+                try:
+                    region_feature = [feature for feature in DataIterator(gff_pathname) if feature.featuretype == 'region'][0]
+                except IndexError:
+                    raise IndexError('No region feature in the GFF file of {0}, GFF file must have region features.'.format(input_folder))
+
+                try:
+                    region_feature.attributes['Dbxref']
+                except KeyError:
+                    raise KeyError('No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'.format(input_folder))
+
+                for dbxref in region_feature.attributes['Dbxref']:
+                    if 'taxon' in dbxref:
+                        taxon_id = dbxref.split('taxon:')[1]
+
+            elif '.pf' in input_file:
+                logger.info('No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'.format(input_folder))
+                taxon_id = "missing"
+            taxon_ids[input_folder] = taxon_id
+
+    return taxon_ids
+
+
 def pwt_input_files(multiprocess_input):
     """
     Check if files needed by Pathway Tools are available, if not create them.
diff --git a/mpwt/results_check.py b/mpwt/results_check.py
index ad1b5f7..a2d3f1b 100755
--- a/mpwt/results_check.py
+++ b/mpwt/results_check.py
@@ -70,7 +70,7 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
                         if patho_log_folder:
                             patho_error_file.write(line)

-                    if 'Build done.' in line:
+                    if 'Build done.' in line or 'PGDB build done.' in line:
                         if patho_log_folder:
                             patho_error_file.write(line)
                         resume_inference_line = next(input_file)
diff --git a/mpwt/utils.py b/mpwt/utils.py
index 3c8b5c1..e89d61a 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -379,17 +379,18 @@ def pubmed_citations(activate_citations):
     """
     Activate or deactivate loading of PubMed citations.

-    TODO: update this function with the argument from the new version of Pathway Tools
+    Args:
+        activate_citations (bool): boolean to indicate if you want to activate or not the downlaod of Pubmed entries.
     """
     ptools_init_filepath = find_ptools_path() + '/ptools-init.dat'
     new_ptools_file = ""
     with open(ptools_init_filepath, 'r') as ptools_init_file:
         for line in ptools_init_file.read().split('\n'):
-            if '##download-pubmed-citations' in line:
+            if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
                 if activate_citations:
-                    line = line.replace('N', 'Y')
+                    line = line.replace('F', 'T')
                 else:
-                    line = line.replace('Y', 'N')
+                    line = line.replace('T', 'F')
             if line != '':
                 new_ptools_file = new_ptools_file + line + '\n'
             else:

From e0d450ba5c6ce42676893ea411b49d0e5bbe3185 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 7 Jan 2020 13:56:17 +0100
Subject: [PATCH 12/15] Add errors and warnings counts in log.
---
 mpwt/results_check.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/mpwt/results_check.py b/mpwt/results_check.py
index a2d3f1b..a7e5c66 100755
--- a/mpwt/results_check.py
+++ b/mpwt/results_check.py
@@ -37,7 +37,7 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
         patho_error_file = open(patho_error_pathname, 'w')
         patho_resume_file = open(patho_resume_pathname, 'w')
         patho_resume_writer = csv.writer(patho_resume_file, delimiter='\t', lineterminator='\n')
-        patho_resume_writer.writerow(['species', 'gene_number', 'protein_number', 'pathway_number', 'reaction_number', 'compound_number'])
+        patho_resume_writer.writerow(['species', 'gene_number', 'protein_number', 'pathway_number', 'reaction_number', 'compound_number', 'pwt_non_fatal_error', 'pwt_warning'])

     failed_inferences = []
     passed_inferences = []
@@ -53,11 +53,17 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
             patho_error_file.write('\n')

        fatal_error_index = None
+        non_fatal_error_count = 0
+        warning_count = 0

        if os.path.exists(patho_log):
            with open(patho_log, 'r') as input_file:
                for index, line in enumerate(input_file):
-                    if 'fatal error' in line or 'Error' in line:
+                    if ';;; Error:' in line:
+                        non_fatal_error_count += 1
+                    if 'Warning:' in line:
+                        warning_count += 1
+                    if 'fatal error' in line:
                        fatal_error_index = index
                        if species not in failed_inferences:
                            failed_inferences.append(species)
@@ -75,12 +81,18 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
                            patho_error_file.write(line)
                        resume_inference_line = next(input_file)
                        patho_error_file.write(resume_inference_line)
+                        if non_fatal_error_count > 0:
+                            non_fatal_error_line = 'Number of non fatal errors: ' + str(non_fatal_error_count) + '. More information in ' + patho_log + '.\n'
+                            patho_error_file.write(non_fatal_error_line)
+                        if warning_count > 0:
+                            warning_line = 'Number of warning: ' + str(warning_count) + '. More information in ' + patho_log + '.\n'
+                            patho_error_file.write(warning_line)
                        gene_number = int(resume_inference_line.split('PGDB contains ')[1].split(' genes')[0])
                        protein_number = int(resume_inference_line.split('genes, ')[1].split(' proteins')[0])
                        pathway_number = int(resume_inference_line.split('proteins, ')[1].split(' base pathways')[0])
                        reaction_number = int(resume_inference_line.split('base pathways, ')[1].split(' reactions')[0])
                        compound_number = int(resume_inference_line.split('reactions, ')[1].split(' compounds')[0])
-                        patho_resume_writer.writerow([species, gene_number, protein_number, pathway_number, reaction_number, compound_number])
+                        patho_resume_writer.writerow([species, gene_number, protein_number, pathway_number, reaction_number, compound_number, non_fatal_error_count, warning_count])
                        passed_inferences.append(species)

        if species not in passed_inferences and species not in failed_inferences:

From 000e48f7499d8ef2c81836dd9869b6e14d8db742 Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Tue, 7 Jan 2020 14:48:09 +0100
Subject: [PATCH 13/15] Add error message if --nc is used with wrong PWT version (issue #34).
---
 mpwt/utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mpwt/utils.py b/mpwt/utils.py
index e89d61a..8144812 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -384,9 +384,12 @@ def pubmed_citations(activate_citations):
     """
     ptools_init_filepath = find_ptools_path() + '/ptools-init.dat'
     new_ptools_file = ""
+
+    download_pubmed_entries_parameter = None
     with open(ptools_init_filepath, 'r') as ptools_init_file:
         for line in ptools_init_file.read().split('\n'):
             if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
+                download_pubmed_entries_parameter = True
                 if activate_citations:
                     line = line.replace('F', 'T')
                 else:
@@ -396,5 +399,8 @@ def pubmed_citations(activate_citations):
             else:
                 new_ptools_file = new_ptools_file + line

+    if not download_pubmed_entries_parameter:
+        sys.exit('There is no Batch-PathoLogic-Download-Pubmed-Entries parameter in ' + ptools_init_filepath +'. To use --nc/no_download_articles, mpwt needs Pathway Tools 23.5 or higher.')
+
     with open(ptools_init_filepath, 'w') as ptools_init_file:
         ptools_init_file.write(new_ptools_file)

From 53025e4d211a474792f6b0c097df8fb47174d6cb Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Thu, 9 Jan 2020 14:26:02 +0100
Subject: [PATCH 14/15] Uncomment argument in ptools-init.dat file to use them.

---
 mpwt/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mpwt/utils.py b/mpwt/utils.py
index 8144812..825f9d7 100755
--- a/mpwt/utils.py
+++ b/mpwt/utils.py
@@ -388,12 +388,14 @@ def pubmed_citations(activate_citations):
     download_pubmed_entries_parameter = None
     with open(ptools_init_filepath, 'r') as ptools_init_file:
         for line in ptools_init_file.read().split('\n'):
-            if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
+            if 'Batch-PathoLogic-Download-Pubmed-Entries?' in line:
+                if '#' in line:
+                    line = line.replace('#', '')
                 download_pubmed_entries_parameter = True
                 if activate_citations:
-                    line = line.replace('F', 'T')
+                    line = line.replace('nil', 'T')
                 else:
-                    line = line.replace('T', 'F')
+                    line = line.replace('T', 'nil')
             if line != '':
                 new_ptools_file = new_ptools_file + line + '\n'
             else:

From 98966970b67c84d2eb7b503317a0160ac9a9088a Mon Sep 17 00:00:00 2001
From: Arnaud Belcour
Date: Thu, 9 Jan 2020 15:25:03 +0100
Subject: [PATCH 15/15] Move to 0.5.3.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 5d95990..bd15101 100755
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 setup(name='mpwt',
       description='Multiprocessing for Pathway Tools',
       long_description=long_description,
-      version='0.5.2',
+      version='0.5.3',
       url='https://github.com/AuReMe/mpwt',
       author='A. Belcour',
       author_email='arnaud.belcour@gmail.com',
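The series above renames the citation switch to ``no_download_articles`` and exposes it both on the command line (``--nc``) and through ``multiprocess_pwt()``. A minimal sketch of the resulting call, assuming mpwt 0.5.3, Pathway Tools 23.5 or higher and placeholder paths:

.. code:: python

    import mpwt

    # Run PathoLogic with the Hole Filler and the Operon Predictor, skip the
    # download of PubMed entries (the --nc equivalent) and keep the PathoLogic logs.
    mpwt.multiprocess_pwt(input_folder='path/to/folder/input',
                          output_folder='path/to/folder/output',
                          patho_inference=True,
                          patho_hole_filler=True,
                          patho_operon_predictor=True,
                          no_download_articles=True,
                          patho_log='path/to/folder/log')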
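The same patch also gives ``taxon_file`` a second role: combined with an input folder and without ``--patho``, ``multiprocess_pwt()`` now writes a ``taxon_id.tsv`` file into the input folder, filled by ``read_taxon_id()`` from the taxon IDs found in the GenBank or GFF files (PathoLogic-format inputs get a ``missing`` value). A short sketch of that mode, with a placeholder path:

.. code:: python

    import mpwt

    # Create path/to/folder/input/taxon_id.tsv with two tab-separated columns,
    # 'species' and 'taxon_id'; the call exits if the file already exists.
    mpwt.multiprocess_pwt(input_folder='path/to/folder/input', taxon_file=True)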
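Patch 12 widens the resume table written by ``check_pwt()`` when a log folder is given with two extra columns, ``pwt_non_fatal_error`` and ``pwt_warning``, counted from the ``;;; Error:`` and ``Warning:`` lines of each PathoLogic log. A sketch of reading those columns back; the file name behind ``patho_resume_pathname`` is assumed here to be ``resume_inference.tsv`` inside the ``--log`` folder, so adjust it to the real name:

.. code:: python

    import csv

    # Assumed location of the tab-separated resume table written by check_pwt().
    resume_tsv = 'path/to/folder/log/resume_inference.tsv'

    with open(resume_tsv) as resume_file:
        for row in csv.DictReader(resume_file, delimiter='\t'):
            # 'pwt_non_fatal_error' and 'pwt_warning' are the two new columns.
            print(row['species'], row['pwt_non_fatal_error'], row['pwt_warning'])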