-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsummarize_for_overview.py
executable file
·170 lines (139 loc) · 6.77 KB
/
summarize_for_overview.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
import os, sys
from collections import defaultdict, OrderedDict
from datetime import datetime
import yaml, yamlloader
from urllib.parse import quote as url_quote
from illuminatus import illuminatus_version
from illuminatus.Formatters import fmt_time, fmt_duration
"""This script provides information about a sequencing run that appears at the top
of each MultiQC report page.
Most of the info comes from pipeline/sample_summary.yml, which is created by
summarize_lane_contents.py (as run by driver.sh).
In addition it will look at:
    RTAComplete.txt (modification timestamp only)
    SampleSheet.csv (to report its name and fully resolved path)
    pipeline/start_times (pipeline version and start time)
"""
def get_pipeline_info(run_path):
    """If the pipeline started actually demultiplexing we can get some extra bits of info.

    The pipeline/start_times file contains the start time, as well as the version, and
    extra lines are added on each redo. It's written out directly by driver.sh just before
    it first triggers this script (to update the report prior to running Snakefile.demux).
    Only the last line is examined, and it is expected to look like '<version>@<integer>'.

    Returns None if pipeline/start_times does not exist (the pipeline never started).
    Otherwise returns a dict with these keys:
        version     - version read from the file (or 'unknown'), with
                      '+<illuminatus_version>' appended if this script differs
        _pl_start   - pipeline start as an int, or None if it could not be parsed
        pl_start    - _pl_start as rendered by fmt_time(), or 'unknown'
        _seq_finish - st_mtime of RTAComplete.txt
        seq_finish  - _seq_finish as rendered by fmt_time()

    Any error from os.stat() on RTAComplete.txt is propagated to the caller.
    """
    pipeline_info = dict()

    try:
        with open(os.path.join(run_path, 'pipeline', 'start_times')) as stfh:
            lines = stfh.readlines()
    except FileNotFoundError:
        # OK, the pipeline didn't start
        return None

    # An empty file is treated like any other junk rather than raising IndexError.
    last_line = lines[-1].rstrip('\n') if lines else ''

    try:
        pipeline_info['version'], start = last_line.split('@', 1)
        pipeline_info['_pl_start'] = int(start)
        pipeline_info['pl_start'] = fmt_time(pipeline_info['_pl_start'])
    except Exception:
        # Tolerate junk in this file. Every key that get_idict() reads must still be
        # present, so fill in whichever ones were not set before the failure.
        pipeline_info.setdefault('version', 'unknown')
        pipeline_info.setdefault('_pl_start', None)
        pipeline_info.setdefault('pl_start', 'unknown')

    # Now if this script belongs to a different version we need to say so, and we
    # end up with a version like 0.0.3+0.1.0, ie. demultiplexed with one version and
    # QC'd with another. This may well be fine, but redo the run from scratch if you
    # need to ensure consistency.
    if illuminatus_version != pipeline_info['version']:
        pipeline_info['version'] += "+" + illuminatus_version

    # If the pipeline started, the sequencer MUST have finished.
    touch_file = os.path.join(run_path, 'RTAComplete.txt')
    pipeline_info['_seq_finish'] = os.stat(touch_file).st_mtime
    pipeline_info['seq_finish'] = fmt_time(pipeline_info['_seq_finish'])

    return pipeline_info
def wrangle_experiment_name(rids):
    """Returns a 2-item list [label, link] for the Experiment Name, or just a name
    if there is no link to make.

    rids is a mapping that is queried with .get() for 'ExperimentName' (referred to
    here as the XML name) and 'ExperimentSS' (the sample sheet name).

    Results:
        no XML name, no sample sheet name -> 'unknown'
        no XML name, sample sheet name    -> the sample sheet name (plain string)
        XML name only, or both equal      -> [ xml_name, link ]
        both set but different            -> [ '[xml_name] (ss_name)', link ]
    """
    expname_from_xml = rids.get('ExperimentName')
    expname_from_ss = rids.get('ExperimentSS')

    if not expname_from_xml:
        # No linky. Fall back to the sample sheet name if there is one.
        # NOTE(review): the previous code had an unreachable 'unknown ({})' fallback
        # here; confirm against summarize_lane_contents.py which label is wanted.
        return expname_from_ss or 'unknown'

    # Hyperlink the expt name to BaseSpace. We can't do this directly since there is an
    # unpredictable number in the URL but this will do. This always uses the
    # expname_from_xml value.
    linky = "https://basespace.illumina.com/search/?type=run&query=" + url_quote(expname_from_xml)

    if expname_from_ss and expname_from_ss != expname_from_xml:
        # Name conflict - show both
        return ["[{}] ({})".format(expname_from_xml, expname_from_ss), linky]

    # All consistent, or expname_from_ss is just missing
    return [expname_from_xml, linky]
def get_idict(rids, run_path, pipeline_info=None):
    """Reshape the run info (rids) into the structure wanted for the overview.

    The result always has a 'pre_start_info' OrderedDict. If a non-empty pipeline_info
    is given, a 'post_start_info' OrderedDict is added as well and the pipeline version
    is reported there rather than in 'pre_start_info'.
    """
    # Either a plain name or a 2-item list of [name, link]
    experiment = wrangle_experiment_name(rids)

    # The sample sheet is reported as [ name, path ] using the fully resolved path.
    real_ss_path = os.path.realpath(os.path.join(run_path, 'SampleSheet.csv'))
    sample_sheet = [os.path.basename(real_ss_path), real_ss_path]

    # Note that we can sometimes get the flowcell type from the params (NovaSeq) and
    # otherwise from the info (everything else).
    pre_start = OrderedDict()
    if not pipeline_info:
        # With pipeline_info, the definitive version goes in post_start_info instead
        pre_start['Pipeline Version'] = illuminatus_version
    pre_start['Run Date'] = rids['RunDate']
    pre_start['Run ID'] = rids['RunId']
    pre_start['Experiment'] = experiment
    pre_start['Instrument'] = rids['Instrument']
    pre_start['Flowcell Type'] = rids.get('FCType') or 'unknown'
    chemistry = rids.get('Chemistry')
    if chemistry:
        # Only shown when there is a value to show
        pre_start['Chemistry'] = chemistry
    pre_start['LaneCount'] = int(rids['LaneCount'])  # MultiQC treats this specially
    pre_start['Cycles'] = rids['Cycles']  # '251 [12] 251',
    pre_start['Pipeline Script'] = get_pipeline_script()
    pre_start['Sample Sheet'] = sample_sheet
    pre_start['t1//Run Start'] = rids['RunStartTime']

    idict = dict()
    idict['pre_start_info'] = pre_start

    if pipeline_info:
        seq_finish_suffix = ""
        if '_seq_finish' in pipeline_info and 'RunStartTimeStamp' in rids:
            # We can work out how long the run took
            run_duration = fmt_duration(rids['RunStartTimeStamp'],
                                        pipeline_info['_seq_finish'])
            seq_finish_suffix = f" ({run_duration})"

        post_start = OrderedDict()
        post_start['Pipeline Version'] = pipeline_info['version']
        post_start['t2//Sequencer Finish'] = f"{pipeline_info['seq_finish']}{seq_finish_suffix}"
        post_start['t3//Pipeline Start'] = pipeline_info['pl_start']
        post_start['Pipeline Start Timestamp'] = pipeline_info['_pl_start']
        idict['post_start_info'] = post_start

    return idict
def get_pipeline_script():
    """Return the path of the driver.sh that sits in the same (fully resolved)
    directory as this file. The pipeline version(s) are reported separately in
    post_start_info.
    """
    script_dir = os.path.realpath(os.path.dirname(__file__))
    return script_dir + '/driver.sh'
def main(run_folder='.'):
    """Load pipeline/sample_summary.yml from run_folder, build the overview info
    and print it to stdout as YAML (with no extra trailing newline).
    """
    # rids == Run Info Data Structure. The YAML it is loaded from is unordered.
    summary_path = os.path.join(run_folder, 'pipeline', 'sample_summary.yml')
    with open(summary_path) as rfh:
        rids = yaml.safe_load(rfh)

    # pipeline_info is None unless the pipeline has actually started (demultiplexed).
    # Everything gets formatted into OrderedDicts by get_idict().
    idict = get_idict(rids, run_folder, get_pipeline_info(run_folder))

    # And print it
    dumped = yaml.dump(idict,
                       Dumper=yamlloader.ordereddict.CSafeDumper,
                       default_flow_style=False)
    print(dumped, end='')
if __name__ == '__main__':
    # Command-line arguments are passed straight through to main(). With none given,
    # main() falls back to its default and examines the CWD.
    cli_args = sys.argv[1:]
    main(*cli_args)