-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmutation_calling.py
140 lines (126 loc) · 8.18 KB
/
mutation_calling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import argparse
from order.calling import generate_calling_file
from order.utils.output_formatters import load_or_create_calling, \
generate_output_file, \
load_or_create_simulations_file, \
save_calling_file
import numpy
if '__main__' == __name__:
parser = argparse.ArgumentParser(description='Analyses hist-pairs file')
parser.add_argument('-n', '--normalize', type=bool, dest='normalize', default=True, help='path for tab delimited histpath-dup_id file')
parser.add_argument('-t', '--truncate', type=bool, dest='truncate', default=False, help='Cleans noise below p (default p=.050)')
parser.add_argument('-c', '--cutpeak', type=bool, dest='cutpeak', default=False, help='Cleans anything with n zen zeros between it and the maximum (default n=1)')
parser.add_argument('-e', '--trimextremes', type=bool, dest='trimextremes', default=False, help='ignores extreme reads')
parser.add_argument('-o', '--outputfile', type=str, dest='output', default='output.tab.gz', help='path of output file')
parser.add_argument('-i', '--inputfile', type=str, dest='input', default='input.taqb.gz', help='path of input file')
parser.add_argument('-r', '--minreads', type=int, dest='minreads', default=10, help='minimal reads threshold')
parser.add_argument('-mc', '--maxcycles', type=int, dest='maxcycles', default=50, help='maximum cycles to simulate')
parser.add_argument('-nc', '--mincycles', type=int, dest='mincycles', default=0, help='minimum cycles to simulate')
parser.add_argument('-s', '--maxscore', type=float, dest='maxscore', default=999.9, help='max (worst) score allowed')
parser.add_argument('-m', '--method', type=str, dest='method', default='emd', help='method for histogram comparison')
parser.add_argument('-v', '--verbose', type=bool, dest='verbose', default=False, help='prints additional calling information columns in the output table')
parser.add_argument('-sf', '--simulationsfile', type=str, dest='simulationsfile', default='sim_hists.pickle', help='path of preprocessed simulations file')
parser.add_argument('-cf', '--callingfile', type=str, dest='callingfile', default='calling.pickle', help='path of preprocessed calling file')
parser.add_argument('-sm', '--shiftmargins', type=int, dest='shiftmargins', default=15, help='number of attempted shifts to either side of the histogram median')
parser.add_argument('-bd', '--simulationmethod', type=str, dest='simulationmethod', default='bin', help='method of MS histogram simulation')
parser.add_argument('-w', '--workers', type=int, dest='workers', default=1, help='Number of working processes')
parser.add_argument('-ma', '--maxalleles', type=int, dest='max_alleles', default=2, help='maximum number of alleles to consider. *This drastically increase runtime!')
parser.add_argument('-sd', '--sampledepth', type=int, dest='sample_depth', default=10000, help='in non-deterministic simulations, this determines the sampling depth of the model histogram')
parser.add_argument('-ml', '--maxmslength', type=int, dest='max_ms_length', default=60, help='maximum ms length to consider')
parser.add_argument('-md', '--mediandistance', type=int, dest='max_distance_from_median', default=10, help='maximum distance from the median value to take into account')
args = parser.parse_args()
normalize = args.normalize
truncate = args.truncate
cutpeak = args.cutpeak
trim_extremes = args.trimextremes
output_file = args.output
input_file = args.input
reads_threshold = args.minreads
score_threshold = args.maxscore
cycles_threshold = args.maxcycles
min_cycles = args.mincycles
shift_margins = args.shiftmargins
max_alleles = args.max_alleles
sample_depth = args.sample_depth
max_ms_length = args.max_ms_length
max_distance_from_median = args.max_distance_from_median
meth = args.method
verbose = args.verbose
sim_method = args.simulationmethod
SIMULATED_HISTS_PATH = args.simulationsfile
SIGNALS_CALLING_PATH = args.callingfile
workers = args.workers
# AC original (mat)#
ups = [numpy.poly1d([0.00026892, -0.00205025])]
dws = [numpy.poly1d([0.00191615, -0.01174076]),
numpy.poly1d([0.00027444, -0.00220836]),
numpy.poly1d([0.0001768, -0.00199328])]
# AC new#
# ups = [numpy.poly1d([0.00019203, -0.00092681])]
# dws = [numpy.poly1d([0.0022604, -0.01658698]),
# numpy.poly1d([0.00039999, -0.00631779]),
# numpy.poly1d([0.00026061, -0.00287503])]
# A (mat)#
# ups = [numpy.poly1d([0.00140512, -0.01120694])]
# dws = [numpy.poly1d([0.00400177, -0.0356642]),
# numpy.poly1d([0.0006561, -0.0092359]),
# numpy.poly1d([0.00025434, -0.0033739])]
# A (bon) #0.100951764511
# ups = [lambda d: 6.50886922e-06*d**2 + 2.25801329e-04*d + 1.13021568e-03]
# dws = [lambda d: 1.04054253e-04*d**2 - 4.53067136e-05*d - 2.32479554e-04]
# G (mat)#
# ups = [numpy.poly1d([0., -0.00926266])]
# dws = [numpy.poly1d([0.005, -0.03624266]),
# numpy.poly1d([0.00197226, -0.01723455]),
# numpy.poly1d([0.0003741, -0.00450624])]
# G (bon)#0.0651886239479
# ups = [lambda d: 1.52697584e-24*d**2 - 1.00000000e-02*d + 1.50925572e-02 ]
# dws = [lambda d: 4.69878494e-05*d**2 + 8.87353541e-03*d - 7.61676267e-02]
# G (bon2)# 0.0708915005001
# ups = [lambda d: 6.59598553e-05*d**2 - 1.39704635e-03*d - 6.95178768e-03]
# dws = [lambda d: 4.73038399e-04*d**2 - 2.10212102e-03*d - 1.05331741e-02]
#'AC'
# ups = [lambda d: 0.0*d**2 + 3.63701136e-04*d - 3.49979857e-03]
# dws = [lambda d: 2.57826596e-05*d**2 + 1.41367146e-03*d - 9.06688104e-03]
# 'AG'
# ups = [lambda d: 0.0*d**2 + 4.78854812e-04*d - 3.47850504e-03]
# dws = [lambda d: 7.45380877e-05*d**2 + 8.41224155e-04*d - 9.37967125e-03]
sim_hists = load_or_create_simulations_file(SIMULATED_HISTS_PATH,
method=sim_method,
max_cycles=cycles_threshold,
ups=ups,
dws=dws,
max_ms_length=max_ms_length,
sample_depth=sample_depth,
max_alleles=max_alleles,
normalize=normalize,
truncate=truncate,
cut_peak=cutpeak,
trim_extremes=trim_extremes)
calling = load_or_create_calling(SIGNALS_CALLING_PATH)
calling = generate_calling_file(input_file,
sim_hists,
calling,
method=meth,
reads_threshold=reads_threshold,
workers=workers,
score_threshold=score_threshold,
min_cycles=min_cycles,
max_cycles=cycles_threshold,
nsamples=None,
normalize=normalize,
truncate=truncate,
cut_peak=cutpeak,
trim_extremes=trim_extremes,
shift_margins=shift_margins,
max_distance_from_median=max_distance_from_median,
max_alleles=max_alleles,
max_ms_length=max_ms_length
)
save_calling_file(calling, SIGNALS_CALLING_PATH)
generate_output_file(input_file,
output_file,
calling,
reads_threshold=reads_threshold,
score_threshold=score_threshold,
verbose=verbose)