-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpipeline_util.py
270 lines (213 loc) · 8.92 KB
/
pipeline_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import os
import re
from glob import glob
####################
# Fastq processing #
####################
def is_control(c):
return re.match('.*(input|control|wce).*', re.sub('.*/', '', str(c)), flags=re.IGNORECASE) is not None
def control_not_required(c):
return re.match('.*(atac|dnase|dhs).*', re.sub('.*/', '', str(c)), flags=re.IGNORECASE) is not None
def _fastq_paths(fastq_dir, fastq_ext):
return list(glob(os.path.join(fastq_dir, f'*.{fastq_ext}')))
def _bams_paths(bams_dir):
return list(glob(os.path.join(bams_dir, '*.bam')))
def fastq_names_wo_ext(fastq_paths):
# file name w/o ext and parent folders: supports *.fastq and *.fastq.gz
return [_split_to_fname_and_ext(f)[0] for f in fastq_paths]
def aligned_names(config, fastq_paths, bams_paths):
if not bool(config['start_with_bams']):
return fastq_aligned_names(config, fastq_paths)
# file name w/o ext and parent folders: supports *.fastq and *.fastq.gz
return [_split_to_fname_and_ext(f)[0] for f in bams_paths]
def fastq_aligned_names(config, fastq_paths):
return _paired_fastq_samples_names(config, fastq_paths) + _single_fastq_samples_names(config, fastq_paths)
def _split_to_fname_and_ext(path):
# assumes file has valid ext
# understands *.{ext} and *.{ext}.gz
fname = os.path.basename(path)
name, dot_ext = os.path.splitext(fname)
if dot_ext == ".gz":
name2, dot_ext2 = os.path.splitext(name)
# remove first dot from ext:
return name2, dot_ext2[1:] + dot_ext
else:
# remove first dot from ext:
return name, dot_ext[1:]
def _sample_by_file(config, fq_path):
name = _split_to_fname_and_ext(fq_path)[0]
if not bool(config['fastq_single_end_only']) and re.match('.*_R?[12]_?$', name):
# paired file
return re.sub('_R?[12]_?$', '', name)
else:
# single end file
return name
def _single_fastq_samples_names(config, fastq_paths):
if bool(config['fastq_single_end_only']):
return fastq_names_wo_ext(fastq_paths)
paired_samples = _paired_fastq_samples_names(config, fastq_paths)
return [name for name in fastq_names_wo_ext(fastq_paths)
if not re.match('.*_R?[12]_?$', name) or re.sub('_R?[12]_?$', '', name) not in paired_samples]
def _paired_fastq_samples_names(config, fastq_paths):
if bool(config['fastq_single_end_only']):
return []
fq_names = set(fastq_names_wo_ext(fastq_paths))
result = []
for name in fq_names:
if re.match('.*_R?[12]_?$', name):
sample = re.sub('_R?[12]_?$', '', name)
suffix = name.replace(sample, '').replace('1', '2')
if f'{sample}{suffix}' in fq_names:
result.append(sample)
return result
def _get_paired_suffixes(config):
# we assume that all the files share similar naming
for f in sorted(glob(os.path.join(config['fastq_dir'], '*.' + config['fastq_ext']))):
name = _split_to_fname_and_ext(f)[0]
if re.match('.*_R?1_?$', name):
sample = re.sub('_R?1_?$', '', name)
suffix1 = name.replace(sample, '')
suffix2 = suffix1.replace('1', '2')
if suffix1 and suffix2 and suffix1 != suffix2 and os.path.exists(f.replace(suffix1, suffix2)):
return suffix1, suffix2
return '_1', '_2'
def bowtie2_input_paths(config, paired):
if bool(config['trim_reads']):
if paired:
suffix1, suffix2 = _get_paired_suffixes(config)
return [
f"trimmed/{{sample}}{suffix1}_trimmed.{config['fastq_ext']}",
f"trimmed/{{sample}}{suffix2}_trimmed.{config['fastq_ext']}"
]
else:
return [
f"trimmed/{{sample}}_trimmed.{config['fastq_ext']}",
]
else:
if paired:
suffix1, suffix2 = _get_paired_suffixes(config)
return [
config['fastq_dir'] + f"/{{sample}}{suffix1}.{config['fastq_ext']}",
config['fastq_dir'] + f"/{{sample}}{suffix2}.{config['fastq_ext']}",
]
else:
return [
config['fastq_dir'] + f"/{{sample}}.{config['fastq_ext']}"
]
def trimmed_fastq_sample_names(fastq_paths):
# here `name` could have _1 and _2 suffix in case of paired reads
return [f"{name}_trimmed" for name in fastq_names_wo_ext(fastq_paths)]
#################
# Control reads #
#################
def _sample_2_control(config, fastq_paths, bams_paths):
result = {}
paths = fastq_paths if not bool(config['start_with_bams']) else bams_paths
for path in paths:
ext = _split_to_fname_and_ext(path)[1]
control_path = find_control_for(path, ext)
control_sample = _sample_by_file(config, control_path) if control_path else None
if control_sample is not None:
result[_sample_by_file(config, path)] = control_sample
return result
def find_control_for(file, ext="bam"):
if is_control(file) or control_not_required(file):
return None
# Find all the files within folder
controls = [os.path.basename(n) for n in glob(f'{os.path.dirname(file)}/*.{ext}') if is_control(n)]
return find_matching_control(os.path.basename(file), controls)
def find_matching_control(bam_name, controls):
if len(controls) == 0:
return ''
# Compute lcs for _ and - separated names, otherwise compute for all symbols
parts_bam = re.split('[\\/\-_\s\.]+', str(bam_name.lower()))
parts_controls = [re.split('[\\/\-_\s\.]+', c.lower()) for c in controls]
match_parts = [_lcs(parts_bam, pc) for pc in parts_controls]
match_scores = [_score_matching_parts(m) for m in match_parts]
max_score_items = [i for i in range(len(match_scores)) if match_scores[i] == max(match_scores)]
if len(max_score_items) == 1:
return controls[max_score_items[0]]
# Otherwise return most similar for string
return max(controls, key=lambda x: _score_matching_parts(_lcs(str(bam_name.lower()), x.lower())))
def _score_matching_parts(matches):
# We want the avangard parts to be more important
return len(matches) + sum(1 / (s + 1) for _, s in matches)
def _lcs(x, y):
"""
Finds longest common subsequence
Code adopted from https://en.wikibooks.org/wiki/Algorithm_Implementation/
Strings/Longest_common_subsequence#Python
"""
m = len(x)
n = len(y)
# An (m+1) times (n+1) matrix
c = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if x[i - 1] == y[j - 1]:
c[i][j] = c[i - 1][j - 1] + 1
else:
c[i][j] = max(c[i][j - 1], c[i - 1][j])
def _lcs_backtrack(i, j):
if i == 0 or j == 0:
return []
elif x[i - 1] == y[j - 1]:
return _lcs_backtrack(i - 1, j - 1) + [(x[i - 1], i - 1)]
else:
if c[i][j - 1] > c[i - 1][j]:
return _lcs_backtrack(i, j - 1)
else:
return _lcs_backtrack(i - 1, j)
return _lcs_backtrack(m, n)
####################
# MACS2 parameters #
####################
def effective_genome_fraction(genome, chrom_sizes_path, pileup_bed):
"""From MACS2 documentation:
The default hs 2.7e9 is recommended for UCSC human hg18 assembly.
Here are all precompiled parameters for effective genome size:
hs: 2.7e9
mm: 1.87e9
ce: 9e7
dm: 1.2e8"""
# Get chr names covered with reads (e.g. if data is filtered by chromosome name
# or some chrs excluded during alignment
chromosomes = set()
with open(str(pileup_bed)) as f:
for line in f:
chr = line.split()[0]
chromosomes.add(chr)
# Sized of chromosomes covered with reads
chrom_sizes = {}
with open(str(chrom_sizes_path)) as f:
for line in f:
chromosome, size = line.split()
chrom_sizes[chromosome] = int(size)
# Normalization if not all genome chromosomes are covered
chromosomes_length = sum([chrom_sizes.get(c, 0) for c in chromosomes])
genome_length = sum(chrom_sizes.values())
if genome.startswith('mm'):
size = 1.87e9
elif genome.startswith('hg'):
size = 2.7e9
else:
raise Exception('Unknown species {}'.format(genome))
return (size / genome_length) * (1.0 * chromosomes_length / genome_length)
def macs_species(genome):
"""Convert genome to macs2 species encoding"""
if re.match('^hg[0-9]+$', genome):
return 'hs'
elif re.match('^mm[0-9]+$', genome):
return 'mm'
raise Exception('Unknown species {}'.format(genome))
# Small test
if __name__ == '__main__':
print(find_matching_control('GSM646340_H1_H3K36me3_rep2.bam', [
'GSM646390_HMEC_Input_rep2.bam',
'GSM646351_H1_Input_rep1.bam',
'GSM646352_H1_Input_rep2.bam',
]))
print(find_matching_control('CD4_H3K27ac_rep1_hg38_ENCFF732DAD.bam', [
'CD4_Control_hg38_ENCFF140FVH.bam',
'CD4ABT_Control_rep1_hg38_ENCFF178GLM.bam',
]))