-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsample_names_fetch.py
executable file
·207 lines (168 loc) · 7.45 KB
/
sample_names_fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python3
import os, sys, re
import ast
import logging as L
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import csv
import shutil
from collections import Counter
from hesiod import parse_cell_name, glob, dump_yaml
def main(args):
    """Run the command-line workflow for a single cell.

    Depending on the flags in ``args`` (the namespace from parse_args()):

    * ``find``: print the path of the TSV file for the cell and exit with
      status 0, or exit with status 1 if no file is found.
    * ``print``: print the sample info as YAML and write no files.
    * otherwise: copy the TSV file into the cell directory (skipped when the
      info holds an error) and save ``<cell>/sample_names.yaml``.

    If the cell argument ends in ``.tsv`` it is treated as a file name to
    load directly rather than as a cell name.
    """
    L.basicConfig(level=(L.DEBUG if args.debug else L.WARNING))

    # Turns r'\t' into '\t' the fancy way. A lone character cannot be an
    # escape sequence, so it is used verbatim; this also avoids a crash when
    # the delimiter is a single quote or a single backslash.
    if len(args.delim) == 1:
        delim = args.delim
    else:
        delim = ast.literal_eval(f"'{args.delim}'")

    experiment = args.experiment or os.path.basename(os.path.abspath('.'))
    cell, = args.cell

    if args.find:
        # In this case, just find the file and quit
        if cell.endswith('.tsv'):
            # Oh, it's a filename not a cell name
            tsv_file = cell if os.path.exists(cell) else None
        else:
            tsv_file = find_tsv(experiment, cell, args.tsvdir)

        # sys.exit() rather than the site-provided exit(), which is meant
        # for the interactive shell and is not always defined.
        if tsv_file:
            print(tsv_file)
            sys.exit(0)
        sys.exit(1)

    # Get the YAML which might be a sample list or might be an error.
    if cell.endswith('.tsv'):
        # Try to load the file directly. This is really for debugging.
        tsv_file = cell
        info_dict = parse_tsv(tsv_file, delim=delim)
        info_dict['file'] = os.path.abspath(tsv_file)
    else:
        info_dict = get_info_main(experiment, cell, args.tsvdir, delim)

    if args.print:
        # Print and done
        print(dump_yaml(info_dict), end='')
    else:
        # Copy the TSV file to the cell dir, and save the YAML.
        # At this point, the cell dir must exist
        if 'error' not in info_dict:
            orig_filename = os.path.basename(info_dict['file'])
            shutil.copyfile(info_dict['file'], f"{cell}/{orig_filename}")

        L.debug(f"Saving out {cell}/sample_names.yaml")
        dump_yaml(info_dict, filename=f"{cell}/sample_names.yaml")
def get_info_main(experiment, cell, dir, delim):
    """Find and parse the sample names file for a cell.

    Returns the dict produced by parse_tsv() with an extra 'file' key holding
    the absolute path of the TSV file. If ``dir`` is not a directory, or no
    matching file is found in it, returns a dict with an 'error' message and
    'file' set to None instead.
    """
    # Without a directory there is nothing to search
    if not os.path.isdir(dir):
        return dict( error = f"No such directory {dir}",
                     file = None )

    # See if we can find a sample names file for this cell
    found = find_tsv(experiment, cell, dir)
    if not found:
        return dict( error = f"No suitable TSV file found in {dir}",
                     file = None )

    # Well, we have a file. Parse it and record where it came from.
    info = parse_tsv(found, delim=delim)
    info['file'] = os.path.abspath(found)
    return info
def find_tsv(experiment, cell, dir='.'):
    """Locate a sample names TSV file to use for this cell.

    Candidate file names are built as '<value>_sample_names.tsv' from several
    fields of the parsed cell name. Files are looked for in dir/*.tsv and
    dir/*/*.tsv. Returns the path of the first match, or None if nothing
    matches.
    """
    L.debug(f"Looking for a TSV file in {os.path.abspath(dir)}")

    cell_info = parse_cell_name(experiment, cell)
    cell_info['CellBase'] = os.path.basename(cell_info['Cell'])
    # Keep only what follows the second underscore, if there are that many
    cell_info['ShortExpt'] = cell_info['Experiment'].split('_', 2)[-1]

    # For a non-pooled flowcell the 'Pool' will be a library name.
    # CellID is the flowcell ID and project is like 12345
    key_order = ( 'CellBase', 'Pool', 'CellID',
                  'Experiment', 'ShortExpt', 'Project' )
    candidate_tsv = [ f"{cell_info[key]}_sample_names.tsv" for key in key_order ]

    # Precedence follows the order of candidate_tsv. Among files sharing a
    # name, top-level files are listed first, so they win over files in
    # subdirectories; within each level the order is whatever glob() returns
    # (presumably alphabetical - confirm against hesiod.glob).
    top_level = glob(f"{dir}/*.tsv")
    one_down = glob(f"{dir}/*/*.tsv")
    all_tsv = top_level + one_down

    L.debug(f"candidate tsv: {candidate_tsv}")
    L.debug(f"files globbed: {all_tsv}")

    hits = ( path
             for wanted in candidate_tsv
             for path in all_tsv
             if os.path.basename(path) == wanted )
    return next(hits, None)
def parse_tsv(filename, delim="\t"):
    """Parse a sample names file into a list of barcode records.

    Each non-blank row must start with a barcode (the word "barcode", with
    some typos tolerated, followed by two digits), then an internal name
    matching five digits, two capital letters and optional word characters,
    then an optional external name. The first row may instead be a header.
    Rows that split into fewer than three fields on ``delim`` are re-split
    on whitespace.

    Returns ``{'barcodes': [...]}`` where each item has the keys 'bc',
    'int_name' and 'ext_name' (None if absent), or ``{'error': message}`` if
    the file cannot be read or decoded, is malformed, holds no barcodes, or
    repeats a barcode or an internal name.
    """
    error = None
    codes = []

    try:
        with open(filename, newline='') as csvfile:
            tsvreader = csv.reader(csvfile, delimiter=delim)
            for n, row in enumerate(tsvreader):

                # If the row does not split neatly, try a basic split on spaces
                if len(row) < 3:
                    row = ' '.join(row).split()

                # Blank rows are ignored.
                if not row:
                    continue

                # Tolerate some typos in the word "barcode"
                mo = re.fullmatch(r'[Bb][arcode]{5,7}(\d\d)', row[0])
                if mo:
                    # Normalise to the canonical spelling
                    row[0] = f"barcode{mo.group(1)}"
                elif n == 0:
                    # Header row does not need to be a barcode
                    continue
                else:
                    # Other rows do need to be a barcode
                    error = f"Unable to parse barcode on line {n+1}"
                    break

                if len(row) == 1:
                    error = f"Missing internal name for {row[0]}"
                    break

                if not re.fullmatch(r'\d{5}[A-Z]{2}\w*', row[1]):
                    error = f"Invalid internal name for {row[0]}: {row[1]!r}"
                    break

                codes.append( dict( bc = row[0],
                                    int_name = row[1],
                                    ext_name = ' '.join(row[2:]).strip() or None ) )
    except (OSError, UnicodeDecodeError, csv.Error) as e:
        # The file cannot be read, cannot be decoded as text, or the csv
        # module rejects it. All are reported as an error in the result
        # rather than crashing, the same as any other bad file.
        error = str(e)

    # OK we gottem. Now some sanity checks
    if not error:
        if not codes:
            error = "No barcodes found in the file"
        else:
            # most_common(1) gives the (value, count) pair with the top count
            rep_bc, = Counter([ c['bc'] for c in codes ]).most_common(1)
            rep_id, = Counter([ c['int_name'] for c in codes ]).most_common(1)

            if rep_bc[1] > 1:
                error = f"Repeated barcode {rep_bc[0]}"
            elif rep_id[1] > 1:
                error = f"Repeated internal name {rep_id[0]}"

    if error:
        return dict( error = error )

    # Yay all tests passed
    return dict( barcodes = codes )
def parse_args(*args):
    """Build the command-line parser and parse the arguments.

    Any positional arguments are handed straight to
    ArgumentParser.parse_args(), so calling with no arguments parses
    sys.argv and calling with a list parses that list. Returns the
    resulting namespace.
    """
    description = """Finds an appropriate sample_names.tsv for a given cell.
                     Copies the file to the cell directory, and also makes
                     a sample_names.yaml with the information in YAML format,
                     or an error if finding or parsing the file fails.
                  """
    epilog = """The env var SAMPLE_NAMES_DIR can be set to override the default
                TSVDIR setting.
             """
    argparser = ArgumentParser( description = description,
                                epilog = epilog,
                                formatter_class = ArgumentDefaultsHelpFormatter )

    # nargs=1 means the value arrives as a one-item list
    argparser.add_argument("cell", nargs=1,
                           help="The cell to find samples for.")
    argparser.add_argument("--experiment",
                           help="Name of experiment. Defaults to basename of CWD.")
    argparser.add_argument("-t", "--tsvdir", default=os.environ.get("SAMPLE_NAMES_DIR", '.'),
                           help="Directory to search for candidate TSV files.")
    # The default is the two characters backslash-t; main() decodes the escape
    argparser.add_argument("--delim", default="\\t",
                           help="Field delimiter used in the TSV files."
                                " Backslash escapes are interpreted.")
    argparser.add_argument("--find", action="store_true",
                           help="Find and print the TSV filename then quit.")
    argparser.add_argument("--print", action="store_true",
                           help="Print the YAML but do not save any files.")
    argparser.add_argument("-d", "--debug", action="store_true",
                           help="Print more verbose debugging messages.")

    return argparser.parse_args(*args)
# Script entry point: parse the command line, then run the workflow.
if __name__ == "__main__":
    cli_args = parse_args()
    main(cli_args)