-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathevaluate_motifs.py
104 lines (82 loc) · 3.38 KB
/
evaluate_motifs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#! /usr/bin/env python3
'''
A script to apply the given motifs to the specified data set, calculating
the accuracy, sensitivity, specificity and Matthews' correlation coefficient.
'''
import csv
import re
import click
from utilities import extract_seqs
def get_matches(motif, seqs):
'''
Apply the given motif to the input sequences, returning the successful
matches.
Args:
motif (str): The motif pattern to apply.
seqs (list): The list of sequences.
Returns:
list: The sequences matching the input motif.
'''
regex = re.compile(motif)
return [s for s in seqs if regex.match(s)]
def evaluate_motifs(motif_file, pos_seq_file, neg_seq_file, central_res,
seq_len, output_file):
'''
Evaluate the given motifs by matching against the positive and negative
sequence files (prealigned or fasta format) and computing various
evaluation metrics.
Args:
motif_file (str): The path to the file containing motif patterns,
in CSV format with a column named 'motif'.
pos_seq_file (str): The path to the file containing positive
sequences, i.e. those which are known to bear
the modification.
neg_seq_file (str): The path to the file containing negative
sequences, i.e. those which are known not to bear
the modification.
central_res (str): The residue on which sequences should be centered.
seq_len (int): The length/target length of the sequences.
output_file (str): The path to the file to which to write the metric
results.
'''
with open(motif_file) as fh:
reader = csv.DictReader(fh)
motifs = [l['motif'] for l in reader]
def _extract_seqs(ifile):
return extract_seqs(ifile, central_res, seq_len, write=False)
pos_seqs = _extract_seqs(pos_seq_file)
neg_seqs = _extract_seqs(neg_seq_file)
motif_scores = []
for motif in motifs:
pos_matches = get_matches(motif, pos_seqs)
neg_matches = get_matches(motif, neg_seqs)
tp = len(pos_matches)
fp = len(neg_matches)
tn = len(neg_seqs) - fp
fn = len(pos_seqs) - tp
acc = (tp + tn) / (tp + tn + fp + fn)
sen = tp / (tp + fn)
spec = tn / (fp + tn)
mcc = ((tp * tn) - (fp * fn)) /\
(((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5)
motif_scores.append((motif, acc, sen, spec, mcc))
with open(output_file, 'w', newline='') as fh:
writer = csv.writer(fh)
writer.writerow(['motif', 'accuracy', 'sensitivity', 'specificity',
'mcc'])
writer.writerows(motif_scores)
@click.command()
@click.argument('motif-file')
@click.argument('pos-sequence-file')
@click.argument('neg-sequence-file')
@click.argument('central-residue')
@click.argument('seq-length', type=int)
@click.option('--output-file', '-o',
help='The file to which to save the calculations',
default='motif_analysis_results.csv')
def main(motif_file, pos_sequence_file, neg_sequence_file, central_residue,
seq_length, output_file):
evaluate_motifs(motif_file, pos_sequence_file, neg_sequence_file,
central_residue, seq_length, output_file)
if __name__ == '__main__':
main()