-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsummarize_by_project.py
executable file
·480 lines (406 loc) · 21.6 KB
/
summarize_by_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
#!/usr/bin/env python3
import sys, os
import json
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from math import isnan
from statistics import mean, stdev
import yaml, yamlloader
from illuminatus.Formatters import rat
""" This script builds the Project Summary table on the overview pages.
See also summarize_lane_contents.py and summarize_yield.py
It needs to get info from the following places:
all Stats.json (I could have stats_json_aggregator.py read the extra info but
it seems better to have this script read the stats directly)
pipeline/sample_summary.yml (Because this knows what the projects are called!)
Output will be in tsv or mqc format following the patter in summarize_lane_contents.py
"""
def parse_args():
    """ Parse the command line for this script. Returns an argparse.Namespace.

        Bug fix: --print_even_if_empty is a flag (per its help text) but was
        declared with no action, so argparse demanded a value for it. It now
        uses nargs='?' so the bare flag works, while any caller that did pass
        a value still works unchanged.
    """
    description = """This script is part of the Illuminatus pipeline.
It makes a fragments-per-project-per-lane matrix for a run.
Output may be in MQC or TSV format or raw YAML. MQC is suitable for MultiQC custom
content - http://multiqc.info/docs/#custom-content, while TSV/YAML is handy
for debugging.
"""
    a = ArgumentParser( description = description,
                        formatter_class = ArgumentDefaultsHelpFormatter)
    a.add_argument("--project_name_list",
                   help="Supply a comma-separated list of project names." +
                        " Normally they would be obtained from sample_summary.yml." +
                        " You can equivalently setenv PROJECT_NAME_LIST." )
    a.add_argument("--sample_summary",
                   help="Read project names from the supplied sample_summary.yml")
    a.add_argument("--from_yml",
                   help="Get the info from the supplied YAML file, not by" +
                        " scanning the directory and the LIMS." )
    a.add_argument("--yml",
                   help="Output in YAML format to the specified file (- for stdout)." )
    a.add_argument("--mqc",
                   help="Output for MultiQC to the specified file (- for stdout)." )
    a.add_argument("--tsv",
                   help="Output in TSV format to the specified file (- for stdout)." )
    # I want to be able to output multiple MQC files in one go...
    # This generates --mqc_frag_pool, --mqc_frag_proj, --mqc_lib_pool, etc.
    for metric in "frag lib bal".split():
        for pool in "pool proj".split():
            a.add_argument("--mqc_{}_{}".format(metric, pool),
                           help="Output specific stats for MultiQC to a file")
    a.add_argument("--metric",
                   help="Output Fragments, Libraries, or Balance (ignored for --yml output)")
    a.add_argument("--by_pool", action="store_true",
                   help="Collate by pool rather than by project (ignored for --yml output)")
    # nargs='?' lets this act as a bare flag (const is truthy) while still
    # accepting an explicit value for backward compatibility.
    a.add_argument("--print_even_if_empty", nargs='?', const='yes', default=None,
                   help="Disable the sanity check that stops us producing useless MQC tables.")
    a.add_argument("json", type=str, nargs='+',
                   help="Stats to be digested.")
    return a.parse_args()
def main(args):
    """ Digest the Stats.json files, aggregate the counts per project and per
        pool, then write out whichever of the requested output files were
        asked for on the command line.
    """
    # First digest all the Stats.json files into {lane -> {pool_library -> fragments}}.
    # We can't robustly group these by project until sample_summary.yml is read.
    all_stats_from_json = extract_stats_from_json(args.json)

    # Start from a dummy (empty) sample summary, then load the real one if given.
    sample_summary = dict( ProjectInfo = dict(), Lanes = list() )
    if args.sample_summary:
        with open(args.sample_summary) as yfh:
            sample_summary = yaml.safe_load(yfh)

    # A list of names can come from the command line or the environment.
    name_list = args.project_name_list or os.environ.get('PROJECT_NAME_LIST', '')

    # Build project_number->name from the YAML ProjectInfo, then let the
    # explicit name list take precedence. YAML entries look like:
    #   ProjectInfo:
    #     '11200':
    #       name: 11200_Hickey_John
    #       url: https://www.wiki.ed.ac.uk/display/GenePool/11200_Hickey_John
    project_to_name = { num: info.get('name', num)
                        for num, info in sample_summary['ProjectInfo'].items() }
    project_to_name.update( (pn.split('_')[0], pn)
                            for pn in name_list.split(',') if '_' in pn )

    # Resolve samples to projects (or pools) - see the aggregate_* functions.
    all_stats_by_project = aggregate_by_project(all_stats_from_json, sample_summary['Lanes'])
    all_stats_by_pool = aggregate_by_pool(all_stats_from_json)

    # Also explicitly note which pools belong to which project.
    pools_per_project = get_pools_per_project(project_to_name.keys(), sample_summary['Lanes'])

    def _emit(formatter, dest):
        # Write via the given formatter to '-' (stdout), a file, or nowhere.
        if dest == '-':
            formatter( all_stats_by_pool, all_stats_by_project, project_to_name,
                       pools_per_project, args, sys.stdout )
        elif dest:
            with open(dest, 'w') as ofh:
                formatter( all_stats_by_pool, all_stats_by_project, project_to_name,
                           pools_per_project, args, ofh )

    # The three generic outputs (loop nicked from summarize_lane_contents).
    _emit(output_yml, args.yml)
    _emit(output_mqc, args.mqc)
    _emit(output_tsv, args.tsv)

    # Also allow multiple specific MQC outputs in one go. Manipulating args
    # like this is lazy but effective.
    for metric in ["frag", "lib", "bal"]:
        for pool in ["pool", "proj"]:
            args.by_pool = (pool == "pool")
            args.metric = dict(frag='Fragments', lib='Libraries', bal="Balance")[metric]
            _emit(output_mqc, vars(args)["mqc_{}_{}".format(metric, pool)])
def cov(counts_list):
    """ Standard CoV (barcode balance) calculation: the ratio of the standard
        deviation to the mean of the supplied counts.
    """
    spread = stdev(counts_list)
    centre = mean(counts_list)
    return rat(spread, centre)
def get_pools_per_project(project_list, lanes_summary):
    """ Tots up the pools for each project. When calculating barcode balance by pool
        there may be a "NoPool" which has samples from multiple projects, so I can't
        reliably infer this info later. This works it out explicitly.
        Note that if a pool is loaded on multiple lanes it will appear multiple times.
    """
    # Start every requested project with an empty list, then scan the lanes.
    res = { p: [] for p in project_list }
    for lane in lanes_summary:
        # Contents maps {project -> {pool -> [libraries]}}; we only want the
        # pool names, in lane order, for projects we were asked about.
        for proj, pool_dict in lane['Contents'].items():
            if proj in res:
                res[proj].extend(pool_dict)
    return res
def aggregate_by_project(all_stats_by_library, lanes_summary):
    """ Gets us from {lane -> {pool_library -> fragments}} to three dicts
        of {(lane/project) -> value} by using the info in lanes_summary, which
        is the 'Lanes' section from the sample_summary.yml
    """
    # Accumulators mapping keys to lists of per-library fragment counts.
    counts_by_lane_proj = dict()   # {(lane/project) -> [counts]}
    counts_by_lane = dict()        # {lane -> [counts]}
    counts_by_proj = dict()        # {project -> [counts]}

    # Work one lane at a time, driven by the keys in all_stats_by_library.
    for lane, reads_by_library in all_stats_by_library.items():
        # Find the 'Contents' for this lane - hopefully exactly one match.
        matching_contents = [ l['Contents'] for l in lanes_summary
                              if l['LaneNumber'] == lane ]
        # Unpack the nested {proj -> {pool -> [libs]}} structure into a flat
        # {pool__library -> project} lookup. The name is reconstructed the same
        # way as in the sample sheet, and since pool names are no longer munged
        # ('NoPool' -> '') this mapping should be reliable.
        library_to_project = dict()
        for lc in matching_contents:
            for proj, pool_dict in lc.items():
                for pool, lib_list in pool_dict.items():
                    for lib in lib_list:
                        library_to_project["{}__{}".format(pool, lib)] = proj
        # Translate each library to its project and tally the counts.
        for library, fragments in reads_by_library.items():
            # Failed lookups go to 'Unknown' (guessing from the pool name
            # prefix would be dangerous).
            project = library_to_project.get(library, 'Unknown')
            key = '{}/{}'.format(lane, project)
            for acc, k in [ (counts_by_lane_proj, key),
                            (counts_by_lane, lane),
                            (counts_by_proj, project) ]:
                acc.setdefault(k, []).append(fragments)

    # Turn the count lists into the three values we want - fragments, libs, balance.
    # Where there is only one library the balance is absent; where there are several
    # libraries and no reads it will be .nan. Lane-wide and project-wide balances
    # are added too, as they can't be derived from the aggregate figures.
    res = dict( Fragments = { k: sum(v) for k, v in counts_by_lane_proj.items() },
                Libraries = { k: len(v) for k, v in counts_by_lane_proj.items() },
                Balance   = { k: cov(v) for k, v in counts_by_lane_proj.items() if len(v) > 1 },
                LBalance  = { k: cov(v) for k, v in counts_by_lane.items() if len(v) > 1 },
                PBalance  = { k: cov(v) for k, v in counts_by_proj.items() if len(v) > 1 } )
    # For completeness poke in an overall barcode balance, though this is likely
    # meaningless when aggregated across projects.
    all_counts = [ c for counts in counts_by_lane.values() for c in counts ]
    if len(all_counts) > 1:
        res['LBalance']['All'] = cov(all_counts)
    return res
def aggregate_by_pool(all_stats_by_library):
    """ Gets us from {lane -> {pool_library -> fragments}} to three dicts of
        {(lane/project) -> value}. Like aggregate_by_project() but we have no need
        of a library->project mapping in this case.
    """
    counts_by_lane_pool = dict()   # {(lane/pool) -> [counts]}
    counts_by_lane = dict()        # {lane -> [counts]}
    counts_by_pool = dict()        # {pool -> [counts]}

    for lane, reads_by_library in all_stats_by_library.items():
        for library, fragments in reads_by_library.items():
            # The pool name is everything before the first '__' separator.
            pool = library.split('__', 1)[0]
            for acc, k in [ (counts_by_lane_pool, '{}/{}'.format(lane, pool)),
                            (counts_by_lane, lane),
                            (counts_by_pool, pool) ]:
                acc.setdefault(k, []).append(fragments)

    # Turn the count lists into the three values we want - fragments, libs, balance.
    # Where there is only one library the balance is absent; where there are several
    # libraries and no reads it will be .nan. Lane-wide and pool-wide balances are
    # added too, as they can't be derived from the aggregate figures.
    res = dict( Fragments = { k: sum(v) for k, v in counts_by_lane_pool.items() },
                Libraries = { k: len(v) for k, v in counts_by_lane_pool.items() },
                Balance   = { k: cov(v) for k, v in counts_by_lane_pool.items() if len(v) > 1 },
                LBalance  = { k: cov(v) for k, v in counts_by_lane.items() if len(v) > 1 },
                PBalance  = { k: cov(v) for k, v in counts_by_pool.items() if len(v) > 1 } )
    # For completeness poke in an overall barcode balance. This might be meaningful
    # with, say, the Hickey pools where we'd like a consistent number of reads
    # across all samples.
    all_counts = [ c for counts in counts_by_lane.values() for c in counts ]
    if len(all_counts) > 1:
        res['LBalance']['All'] = cov(all_counts)
    return res
def output_yml(all_stats_by_pool, all_stats_by_project, project_to_name, pools_per_project, args, fh):
    """ Super-simple YAML dumper. Ignores args.
    """
    struct = dict( stats_by_pool     = all_stats_by_pool,
                   stats_by_project  = all_stats_by_project,
                   project_to_name   = project_to_name,
                   pools_per_project = pools_per_project )
    # yaml.dump() already ends with a newline so write the text verbatim.
    fh.write( yaml.dump( struct,
                         Dumper = yamlloader.ordereddict.CSafeDumper,
                         default_flow_style = False ) )
def output_mqc(all_stats_by_pool, all_stats_by_project, project_to_name, pools_per_project, args, fh):
    """This also happens to be YAML but is specifically for display
       in MultiQC. The filename should end in _mqc.yaml (not .yml) in
       order to be picked up.
       I was trying to generate a table with radio buttons that switch views, but
       that doesn't work. Instead I patched MultiQC to put multiple Custom Content
       tables into a section.

       Reads args.metric ('Fragments'/'Libraries'/'Balance') and args.by_pool
       to decide which stats dict to tabulate, and args.print_even_if_empty to
       disable the useless-table suppression at the bottom.
    """
    # Skeleton of the MultiQC custom-content document; 'data' and 'headers'
    # get filled in below, and 'id'/'title' are specialised per metric.
    mqc_out = dict(
        id = 'matrix',
        section_name = 'Project Summary',
        description = 'Summary of reads and barcode balance broken out by project and pool', # Will only be shown once.
        plot_type = 'table',
        pconfig = { 'title': '', 'sortRows': True, 'no_beeswarm': True },
        data = {},
        headers = {},
    )
    # Load up the data dict - this bit is copied from output.tsv
    if args.by_pool:
        plabel = 'Pool'
        # The behaviour is that if args.metric is invalid we get a failure here
        all_stats = all_stats_by_pool
        stats_for_metric = all_stats[args.metric]
        # Pools have no friendly names, so no translation table.
        p_to_name = dict()
    else:
        plabel = 'Project'
        all_stats = all_stats_by_project
        stats_for_metric = all_stats[args.metric]
        p_to_name = project_to_name
    # Specialise the table id/title and pick the aggregation + number format.
    # The numeric prefix on the id keeps the tables in a stable order.
    if args.metric == 'Fragments':
        mqc_out['id'] += '1_fragments_by' + plabel
        mqc_out['pconfig']['title'] = "Fragments by " + plabel
        agg = sum
        agg_label = 'Total'
        fmt = "{:,}"
    if args.metric == 'Libraries':
        # NOTE(review): the id says 'fragments' here - looks like a copy-paste
        # slip, but the '2' prefix still keeps the ordering; confirm before changing.
        mqc_out['id'] += '2_fragments_by' + plabel
        mqc_out['pconfig']['title'] = "Library Counts by " + plabel
        agg = sum
        agg_label = 'Total'
        fmt = "{:,}"
    if args.metric == 'Balance':
        # NOTE(review): same 'fragments' copy-paste in the id as above.
        mqc_out['id'] += '3_fragments_by' + plabel
        mqc_out['pconfig']['title'] = "Barcode Balance by " + plabel
        # Balance can't be summed; pre-calculated P*/L* values are used instead.
        agg = None
        agg_label = 'Overall'
        fmt = "{:,.03}"
    # Row keys (projects/pools) come from both the stats and the name table;
    # column keys (lanes) come from the stats alone.
    ps = sorted(set( [k.split('/',1)[1] for k in stats_for_metric] +
                     [k for k in p_to_name] ))
    lanes = sorted(set( [k.split('/',1)[0] for k in stats_for_metric] ))
    # One data row per project/pool, with a cell per lane that has a value.
    for p in ps:
        proj_label = p_to_name.get(p, p)
        mqc_out['data'][proj_label] = { 'lane{}'.format(l): stats_for_metric.get('{}/{}'.format(l,p))
                                        for l in lanes
                                        if '{}/{}'.format(l,p) in stats_for_metric }
        # Add a total if there were >1 lanes
        if len(lanes) > 1:
            if agg:
                p_total = agg(stats_for_metric.get('{}/{}'.format(l,p), 0) for l in lanes)
            else:
                # Must be pre-calculated
                p_total = all_stats['P'+args.metric].get(p, 0)
            mqc_out['data'][proj_label]['total'] = p_total
    # Add a summary row. Since all our projects and pools start with a number or 'N' this should
    # stay at the bottom so long as the title is 'Total' or 'Overall'.
    if agg:
        # Add them up. This time there will be no blanks (maybe zeros)
        l_totals = [ agg(stats_for_metric.get('{}/{}'.format(l,p), 0) for p in ps) for l in lanes ]
        grand_total = agg(l_totals)
    else:
        # Get the pre-calculated values
        l_totals = [ all_stats['L'+args.metric].get(l, 0) for l in lanes ]
        grand_total = all_stats['L'+args.metric].get('All', 0)
    # Only print the summary row if there was more than one project/pool.
    if len(ps) > 1:
        mqc_out['data'][agg_label] = { 'lane{}'.format(l): v for l, v in zip(lanes, l_totals) }
        if len(lanes) > 1:
            mqc_out['data'][agg_label]['total'] = grand_total
    # That's the data loaded. Now to set up the headers.
    mqc_out['pconfig']['col1_header'] = plabel
    for l in lanes:
        # max scales the in-cell bars; a NaN grand total (possible for Balance)
        # would break that, so it is suppressed.
        mqc_out['headers']['lane{}'.format(l)] = dict( min = 0,
                                                       max = grand_total if not isnan(grand_total) else None,
                                                       format = fmt,
                                                       title = "Lane {}".format(l) )
    if len(lanes) > 1:
        mqc_out['headers']['total'] = dict( format = fmt,
                                            title = agg_label )
    # Now decide if we want to skip this. Useless tables are (I think):
    # Any balance if no lane has more than one sample
    # Libs/Frags by pool if no project has more than one pool
    # Balance by project if ditto
    # Library counts by project if there is only one project
    reason_to_skip = None
    pppv = pools_per_project.values()
    if not args.print_even_if_empty:
        if not stats_for_metric:
            reason_to_skip = "No stats for {}".format(args.metric)
        elif args.by_pool and \
             (args.metric != 'Balance') and \
             not any(len(set(ppp)) > 1 for ppp in pppv):
            reason_to_skip = "No project here has more than one pool, so skipping per-pool summary"
        # In run 180412_M01270_0454_000000000-D3R8F we see that a pool may span multiple projects(!),
        # so be careful.
        elif not args.by_pool and \
             (args.metric == 'Balance') and \
             not any(len(set(ppp)) > 1 for ppp in pppv) and \
             not sum(len(set(l)) for l in pppv) > len(set(i for l in pppv for i in l)):
            reason_to_skip = "No project here has more than one pool, and no pools span projects, so skipping per-project summary"
        elif (not args.by_pool) and (args.metric == 'Libraries') and not len(pools_per_project) > 1:
            reason_to_skip = "There is only one project"
    if reason_to_skip:
        # Emit a stub document so downstream tooling still finds a file.
        print("reason_to_skip: '{}'".format(reason_to_skip), file=fh)
        print("data: []", file=fh)
    else:
        # That's all she wrote. Print it out...
        print( yaml.dump( mqc_out,
                          Dumper = yamlloader.ordereddict.CSafeDumper,
                          default_flow_style = False ),
               file = fh,
               end = '' )
def output_tsv(all_stats_by_pool, all_stats_by_project, project_to_name, pools_per_project, args, fh):
    """ Output as TSV.
        Projects (or pools) in rows, lanes in columns.
        If args.metric is unset, all three tables (Fragments, Libraries, Balance)
        are printed, each preceded by a '===Key===' separator line.

        Bug fix: the '===Key===' separator was printed to stdout rather than
        to fh like everything else; it now goes to fh.
    """
    if args.by_pool:
        plabel = 'Pool'
        all_stats = all_stats_by_pool
        p_to_name = dict()
    else:
        plabel = 'Project'
        all_stats = all_stats_by_project
        p_to_name = project_to_name
    # We could have projects in project_to_name that are not in the Stats
    # and vice versa. Include them all and sort by ID.
    ps = sorted(set( [k.split('/',1)[1] for k in all_stats['Libraries']] +
                     [k for k in p_to_name] ))
    lanes = sorted(set( [k.split('/',1)[0] for k in all_stats['Libraries']] ))

    def _str(x):
        # Numbers get thousands separators; floats shown to 3 significant figures.
        if type(x) is float:
            return "{:,.3}".format(x)
        elif type(x) is int:
            return "{:,}".format(x)
        else:
            return str(x)

    for tspec in [ dict(key='Fragments', agg=sum,  agg_label='Total'),
                   dict(key='Libraries', agg=sum,  agg_label='Total'),
                   dict(key='Balance',   agg=None, agg_label='Overall'),
                 ]:
        if args.metric and args.metric != tspec['key']:
            continue
        if not args.metric:
            # Fixed: was print(...) with no file=, so it leaked to stdout.
            print("===" + tspec['key'] + "===", file=fh)
        # Header row:
        header = [plabel] + ['Lane{}'.format(n) for n in lanes]
        if len(lanes) > 1:
            header.append(tspec['agg_label'])
        print('\t'.join(header), file=fh)
        # One row per project, keeping a running per-column total.
        p_by_l_totals = [ 0 for l in lanes ]
        if len(lanes) > 1:
            p_by_l_totals.append(0)
        for p in ps:
            # Get the values for the row and tack on the aggregate.
            p_by_l = [ all_stats[tspec['key']].get('{}/{}'.format(l,p), 0) for l in lanes ]
            if len(lanes) > 1:
                if tspec['agg']:
                    p_by_l.append(tspec['agg'](p_by_l))
                else:
                    # Balance can't be summed; must be pre-calculated.
                    p_by_l.append(all_stats['P'+tspec['key']].get(p, 0))
            row = [p_to_name.get(p, p)] + p_by_l
            print('\t'.join(_str(x) for x in row), file=fh)
            # Add the row to the running sum - NumPy would do this more neatly.
            if tspec['agg']:
                p_by_l_totals = [ tspec['agg'](x) for x in zip(p_by_l_totals, p_by_l) ]
        if not tspec['agg']:
            # For Balance, fetch the pre-calculated per-lane and overall values.
            p_by_l_totals = [ all_stats['L'+tspec['key']].get(l, 0) for l in lanes ]
            if len(p_by_l_totals) > 1:
                p_by_l_totals.append( all_stats['L'+tspec['key']].get('All', 0) )
        # And now print the row of totals at the bottom, if there was >1 project
        if len(ps) > 1:
            last_row = [ tspec['agg_label'] ] + p_by_l_totals
            print('\t'.join(_str(x) for x in last_row), file=fh)
def extract_stats_from_json(json_files):
    """ Go through all the JSON files and make a dict
        { lane -> { pool_library -> fragments } }.
        Somewhat similar to stats_json_aggregator.py.
    """
    all_stats = dict()
    for stats_file in json_files:
        with open(stats_file) as fh:
            stats_json = json.load(fh)
        # Add the conversion stats per lane. Each lane may only appear once
        # across all the supplied files.
        for cr_per_lane in stats_json['ConversionResults']:
            ln = str(cr_per_lane["LaneNumber"])
            assert ln not in all_stats, "Lane {} seen already.".format(ln)
            lane_stats = dict()
            for r in cr_per_lane["DemuxResults"]:
                lane_stats[r["SampleId"]] = r["NumberReads"]
            all_stats[ln] = lane_stats
    return all_stats
# Script entry point: parse the command line and run the pipeline step.
if __name__=="__main__":
    main(parse_args())