-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocessing.bash
executable file
·732 lines (667 loc) · 25.8 KB
/
preprocessing.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
#!/bin/bash
# Preprocessing as per Section "Prediction region selection" in Genome Research manuscript [10.1101/gr.277677.123].
# Steps are as in publication. Stages are for debugging.
#
# Input: threshold t or "median", R replicate BAM files, one merged BAM file, size region L
#
## Stage 0: Create merged BAM file if it does not exist
## Stage 1 (-bga.bedGraph): Compute replicate and merged file coverage.
## Stage 2 (_median.txt): Compute threshold if user requests "median".
# - For each replicate, compute median of nonzero coverage values on each chromsome. Minimum across replicates is threshold t_c for chromosome c.
## Stage 3 (regions1)
# Step 1:
# - Retain genome positions with coverage >t_c in all R individual BAM files.
## Stage 4 (regions2)
# Step 2:
# - Merge retained sites within 90 bp.
# - Retain all regions longer than 100 bp
## Stage 5 (regions3)
# Step 3.1: Extend regions short to L.
# - Extending (L/2) bp up- and down-stream of the midpoint.
# Step 3.2: Split regions larger than L.
# - Find positions with coverage summed across replicates $\geq 0.95$ quantile of the region (obtained from the merged BAM file).
# - Positions within L bp are merged
# - Extended (L/2) bp up- and down-stream of each merged region's midpoint
## Stage 6 (regions4)
# - Remove blacklist regions.
## Stage 7 (covBga.txt): Get coverage of selected regions.
## Stage 8 (bigInputs.txt): Get biginput regions (the original regions)
last_stage=10
### Process command line.
## default values
pdir="."
indir="example"
outdir=""
genome="hg38"
cutoff="median"
threads=1
prefix="out"
inputLn=1000
quiet=0
verbose=0
overwrite=0
zerowrite=0
save=0
ref_prefix="chr"
exclude="MATCHES_NOTHING_ON_PLANET_EARTH"
helpFunction()
{
echo ""
echo "Usage: $0 -d INPUT_DIR -b \"BAM_REP1 BAM_REP2[ BAM_REP3...]\" -n NAME"
echo -e "\t-b STR BAM files of replicate experiments. Provide as space-separated strings surround by double quotes."
echo -e "\t For example: -b \"MCF7_chr10_rep1.bam MCF7_chr10_rep2.bam\""
#\n\t Must be sorted and indexed, though if index is missing, we will index for you."
echo -e "\t-c STR Cutoff for coverage. Either \"median\" or integer (DEFAULT: cutoff)."
echo -e "\t-d STR Input directory where BAM and BAI input files are stored (DEFAULT: $indir)."
echo -e "\t-e STR Exclude reference sequence names matching grep extended regexp (DEFAULT: not set)."
echo -e "\t For example, set to \"_\" to get rid of unplaced scaffolds."
echo -e "\t-g STR Genome the data are aligned to. Currently support mm10 (Ensembl), hg38 (Ensembl), or any unique id for another genome (DEFAULT: $genome)."
echo -e "\t-L INT Length of RCL input segments (DEFAULT: $inputLn)."
echo -e "\t-m STR Provide the merged BAM file if you have already created it. (DEFAULT: not named)"
echo -e "\t-n STR File name prefix (DEFAULT: $prefix)."
echo -e "\t-o STR Output directory where output files will go (DEFAULT: same as input directory)."
echo -e "\t-p STR Program directory where preprocessing scripts are located (DEFAULT: $pdir)."
echo -e "\t-t INT Number of threads to use (DEFAULT: $threads)."
echo -e "\t-q Quiet, no output of progress (DEFAULT: no)."
echo -e "\t-r STR Reference sequence name prefix (DEFAULT: $ref_prefix)."
echo -e "\t-s Save intermediate files (DEFAULT: no)."
echo -e "\t-v Verbose debugging output (DEFAULT: no)."
echo -e "\t-w INT Overwrite existing files from stage INT (DEFAULT: no)."
echo -e "\t Stage 1: Coverage bedgraphs per replicate, merged per reference sequence (*-bga.bedGraph)."
echo -e "\t Stage 2: Median coverage per replicate/reference sequence (*_median.txt)."
echo -e "\t Stage 3: Sites exceeding threshold (*-regions1.txt: reference, start, end, coverage)."
echo -e "\t Stage 4: Candidate regions (*-regions2.txt: ref, reg_start, reg_end, reg_name, ref, cov_start, cov_end, sum_coverage)."
echo -e "\t Stage 5: Bed file of initial L-length segments (step 3) (*-regions3.txt)."
echo -e "\t Stage 6: Bed file of blacklist filtered L-length segments (*-regions4.txt)."
echo -e "\t Stage 7: Bedgraph of candidate segments (*covBga.txt)."
echo -e "\t Stage 8: Final candidate regions (*bigInputs.txt)."
echo -e "\t-z INT Overwrite only zero-length existing files from stage INT (DEFAULT: no)."
echo -e "\t-? This help."
exit 1 # Exit script after printing help
}
stop=0
## read command line
while getopts "?12345678qsvb:c:d:e:g:L:m:n:p:r:o:t:w:z:" opt
do
case "$opt" in
b) indivBam="$OPTARG";;
c) cutoff="$OPTARG";;
d) indir="$OPTARG";;
e) exclude="$OPTARG";;
g) genome="$OPTARG";;
L) inputLn="$OPTARG";;
m) mergedBam="$OPTARG";;
n) prefix="$OPTARG";;
o) outdir="$OPTARG";;
p) pdir="$OPTARG";;
q) quiet=1;;
r) ref_prefix="$OPTARG";;
s) save=1;;
t) threads="$OPTARG";;
v) verbose=1;;
w) overwrite=$OPTARG;;
z) zerowrite=$OPTARG;;
1) stop=1;;
2) stop=2;;
3) stop=3;;
4) stop=4;;
5) stop=5;;
6) stop=6;;
7) stop=7;;
8) stop=8;;
?) helpFunction ;; # Print helpFunction in case parameter is non-existent
esac
done
if [ $quiet -eq 1 ]; then
verbose=0
fi
if [ $overwrite -eq 0 ]; then
overwrite=$((last_stage+1))
fi
## reset defaults based on user input
indir=${indir%/}
if [ $verbose -eq 1 ]; then echo "[STAGE 0] ($0:${LINENO}) Setting input directory to \"$indir\"."; fi
if [ -z "$outdir" ]; then
outdir=$indir
if [ $quiet -eq 0 ]; then echo "[STAGE 0] ($0:${LINENO}) Setting output directory to \"$outdir.\""; fi
fi
if [[ $genome == "hg38" ]]; then
genome="hg"
fi
if [[ $genome == "mm10" ]]; then
genome="mm"
fi
basename=""
## guess the merged bam file name
if [ -z $mergedBam ]; then
for rep in $indivBam; do
mergedBam=${rep/_rep1.bam/_merged.bam}
if [[ "$mergedBam" == "$rep" ]]; then
mergedBam=${rep/.bam/_merged.bam}
basename=${rep/.bam/}
else
basename=${rep/_rep1.bam/}
fi
if [ $quiet -eq 0 ]; then echo "[STAGE 0] ($0:${LINENO}) Guessing the merged BAM file is \"$indir/$mergedBam\"."; fi
break
done
fi
user_prefix=$prefix # user's prefix
if [ ! -z $basename ]; then
prefix="$prefix"_"$basename"
fi
## check for missing user input and help user
if [ -z "$pdir" ] || [ -z "$indir" ] || [ -z "$outdir" ] || [ -z "$genome" ] || [ -z "$cutoff" ] || #|| [ -z "$mergedBam" ]
[ -z "$indivBam" ] || [ -z "$threads" ] || [ -z "$prefix" ] || [ -z "$inputLn" ] #|| [ -z "$skip" ]
then
echo "[ERROR] ($0:${LINENO}) Some or all of the parameters are empty";
helpFunction
fi
### Stage 0 Create any missing files.
## sort and index individual BAM files if they are not yet indexed
nindivBam=""
for bam in $indivBam; do
# BAM file missing
if [ ! -s "$indir/$bam" ]; then
echo "[ERROR] BAM file $indir/$bam not found or empty."
cnt=0
for bam in $indivBam; do
cnt=$((cnt+1))
done
echo -e "[INFO] You named $cnt BAM files \"$indivBam\" on your command line.\n[INFO] There can be no spaces in the BAM file names."
exit 1
fi
# create missing BAM file index
if [ ! -s "$indir/${bam}.bai" ]; then
if [[ "$bam" =~ ".*\.bam" ]]; then
echo "[ERROR] BAM file $indir/$bam must use extension .bam."
exit 1
fi
echo "[STAGE 0] ($0:${LINENO}) Detected no index for $indir/$bam..."
sorted=`samtools view -H "$indir"/$bam | grep SO:coordinate`
if [ -z "$sorted" ]; then
obam=${bam/.bam/.srt.bam}
if [ $quiet -eq 0 ]; then echo "[STAGE 0] ($0:${LINENO}) Sorting $indir/$bam and writing to $indir/$obam..."; fi
if [ $verbose -eq 1 ]; then set -x; fi
samtools sort -@ $threads -o "$indir"/"$obam" "$indir"/"$bam"
set +x
else
obam=$bam
fi
if [ $quiet -eq 0 ]; then echo "[STAGE 0] ($0:${LINENO}) Indexing $indir/$obam..."; fi
if [ $verbose -eq 1 ]; then set -x; fi
samtools index -@ $threads "$indir"/"$obam"
set +x
if [ -z "$nindivBam" ]; then
nindivBam="$obam"
else
nindivBam="$nindivBam $obam"
fi
else # this one does have an index!
if [ -z "$nindivBam" ]; then
nindivBam="$bam"
else
nindivBam="$nindivBam $bam"
fi
fi
done
# overwrite BAM file list
if [ ! -z "$nindivBam" ]; then
if [[ "$nindivBam" =~ ".*\.srt\.bam" ]]; then
echo "[WARNING] We wrote sorted BAM files \"$nindivBam\". You should use these files instead of \"$indivBam\" in future runs of this script."
fi
indivBam=$nindivBam
fi
## create and index merged BAM file if it does not exist
if [ ! -s "$indir"/"$mergedBam" ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 0] ($0:${LINENO}) Creating merged bam file $indir/$mergedBam..."; fi
if [ $verbose -eq 1 ]; then set -x; fi
pushd "$indir" > /dev/null; samtools merge -@ $threads "$mergedBam" $indivBam; popd > /dev/null
set +x
fi
if [ ! -s "$indir"/"$mergedBam".bai ]; then
if [ $verbose -eq 1 ]; then set -x; fi
samtools index -@ $threads "$indir"/"$mergedBam"
set +x
fi
### STAGE 1: get necessary coverage files (Step 0)
## Step 0.0: read chromosome names
if [ $quiet -eq 0 ]; then echo -n "[STAGE 0] ($0:${LINENO}) References sequences found in mergedBam file: "; fi
if [ $verbose -eq 1 ]; then set -x; fi
chrList=`samtools view -H "$indir"/"$mergedBam" | grep "SN:" | cut -f 2 | grep -v -E $exclude | sed 's/SN://g' | tr "\n" " "`
set +x
chrList=${chrList% }
if [ $quiet -eq 0 ]; then echo $chrList; fi
## Step 0.1: get bga genomecov of the data for each chromosome (-bga.bedGraph)
if [ $quiet -eq 0 ]; then echo "[STAGE 1] ($0:${LINENO}) Getting coverage data..."; fi
# first for merged BAM file
nchrList=""
for chr in $chrList
do
OUT="$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"_merged-bga.bedGraph
if [[ ! -e "$OUT" || $overwrite -le 1 || ( $zerowrite -le 1 && ! -s "$OUT") ]]; then
# check for reads aligned to this chromosome
nreads=`samtools view "$indir"/"$mergedBam" $chr | head | wc --lines`
if [ $nreads -gt 0 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 1] ($0:${LINENO}) Writing merged coverage for reference sequence $ref_prefix$chr to file \"$OUT\"."; fi
if [ $verbose -eq 1 ]; then set -x; fi
mkdir --parents "$outdir"/$ref_prefix$chr
samtools view -hb -@ $threads "$indir"/"$mergedBam" $chr |
bedtools genomecov -ibam - -pc -bga |
grep -w "^$chr" \
> "$OUT"
set +x
if [ ! -s "$OUT" ]; then
echo "[ERROR] Failed to create \"$OUT\" with coverage data."
exit 1
fi
if [ -z "$nchrList" ]; then
nchrList="$chr"
else
nchrList="$nchrList $chr"
fi
elif [ $quiet -eq 0 ]; then
echo "[STAGE 1] ($0:${LINENO}) No reads aligned to reference sequence $chr in \"$indir/$mergedBam\"."
fi
else
if [ $quiet -eq 0 ]; then echo "[STAGE 1] ($0:${LINENO}) Using existing coverage file $OUT"; fi
if [ -z "$nchrList" ]; then
nchrList="$chr"
else
nchrList="$nchrList $chr"
fi
fi
done
if [ -z "$nchrList" ]; then
echo "[ERROR] There were no aligned reads in $indir/$mergedBam"
exit 1
else
if [ $verbose -eq 1 ]; then echo "[STAGE 1] ($0:${LINENO}) Resetting list of reference names to: $nchrList"; fi
chrList=$nchrList
fi
# then for individual BAM file
for ifile in $indivBam
do
ofile=${ifile/.bam/}
OUT="$outdir"/"$user_prefix"_"$ofile"-bga.bedGraph
if [[ ! -e "$OUT" || $overwrite -le 1 || ( $zerowrite -le 1 && ! -s "$OUT") ]]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 1] ($0:${LINENO}) Writing coverage in replicate $ifile to \"$OUT\"."; fi
if [ $verbose -eq 1 ]; then set -x; fi
bedtools genomecov -ibam "$indir"/"$ifile" -pc -bga > "$OUT"
set +x
if [ ! -s "$OUT" ]; then
echo "[ERROR] Failed to create \"$OUT\" with coverage data."
exit 1
fi
else
if [ $quiet -eq 0 ]; then echo "[STAGE 1] ($0:${LINENO}) Using existing coverage file \"$OUT\""; fi
fi
done
if [ $stop -eq 1 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 1] ($0:${LINENO}) Exiting after stage 1."; fi
exit 1;
fi
## Step 0.2: compute medians for each chromosome, each replicate
OUT="$outdir"/"$prefix"_median.txt
if [[ ! -e "$OUT" || $overwrite -le 2 || ( $zerowrite -le 2 && ! -s "$OUT") ]]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 2] ($0:${LINENO}) Computing coverage threshold(s)..."; fi
if [[ $cutoff == "median" ]]
then
if [ -s "$OUT" ]; then rm "$OUT"; fi # since $pdir/getMedian.bash appends, need to truncate first
for bam in $indivBam
do
obam=${bam/.bam/}
if [ $verbose -eq 1 ]; then set -x; fi
bash "$pdir"/getMedian.bash "$chrList" "$outdir"/"$user_prefix"_"$obam"-bga.bedGraph "$OUT" $threads
set +x
done
if [ ! -s "$OUT" ]; then
echo "[ERROR] Failed to create \"$OUT\" with median data."
exit 1
fi
fi
else
if [ $quiet -eq 0 ]; then echo "[STAGE 2] ($0:${LINENO}) Using existing median file \"$OUT\""; fi
fi
if [ $stop -eq 2 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 2] ($0:${LINENO}) Exiting after stage 2."; fi
exit 1;
fi
### Step 1: produce regions
## Step 1.2: get regions surpassing cutoff in every replicate (STAGE 3 regions1)
nchrList=""
for chr in $chrList
do
# check if file already exists
todo=0
for bam in $indivBam; do
obam=${bam/.bam/}
OUT="$outdir"/$ref_prefix"$chr"/"$user_prefix"_"$obam"_"$chr"-regions1.txt
if [[ ! -e "$OUT" || $overwrite -le 3 || ($zerowrite -le 3 && ! -s "$OUT" ) ]]; then
todo=1
break
else
if [ $quiet -eq 0 ]; then echo "[STAGE 3] ($0:${LINENO}) Using existing regions1 file \"$OUT\""; fi
fi
done
if [ $todo -eq 0 ]; then
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
continue;
fi
# obtain (chromosome-specific) cutoff: minimum of medians already computed
if [[ $cutoff == "median" ]]; then
if [ $verbose -eq 1 ]; then set -x; fi
chr_cutoff=`grep -w "^$chr" "$outdir"/"$prefix"_median.txt | sort -k2,2n | cut -f 2 | head -n 1`
set +x
if [ $chr_cutoff -eq 0 ]; then
echo "[WARNING] Minimum median is 0 for reference sequence $chr. Skipping."
continue
fi
else
chr_cutoff=$cutoff
fi
# create bed file with selected regions
if [ $quiet -eq 0 ]; then echo "[STAGE 3] ($0:${LINENO}) Extracting regions with coverage >$chr_cutoff to file \"$outdir/$ref_prefix$chr/$user_prefix_*_$chr-regions1.txt\""; fi
if [ $verbose -eq 1 ]; then set -x; fi
echo $indivBam |
sed 's/.bam//g' |
sed 's/ /\n/g' |
parallel "samtools view -hb \"$indir\"/{}.bam \"$chr\" -@ $threads | bedtools genomecov -ibam - -pc -bga | grep -w \"^$chr\" | awk -v c=$chr_cutoff '{if(\$4>c){print}}' > \"$outdir\"/$ref_prefix\"$chr\"/\"$user_prefix\"_{}_\"$chr\"-regions1.txt"
# TODO: getAboveThreshold.bash script no longer needed
#parallel "bash \"$pdir\"/getAboveThreshold.bash \"$indir\"/{} \"$chr\" $chr_cutoff \"$outdir\"/$ref_prefix\"$chr\"/\"$user_prefix\"_{}_\"$chr\"-regions1.txt $threads"
set +x
chr_added=0
for bam in $indivBam; do
obam=${bam/.bam/}
OUT="$outdir"/$ref_prefix"$chr"/"$user_prefix"_"$obam"_"$chr"-regions1.txt
if [ ! -e "$OUT" ]; then
echo "[ERROR] Failed to create \"$OUT\" with median data."
exit 1
elif [ ! -s "$OUT" ]; then
echo "[WARNING] No regions found for replicate $obam, reference sequence $chr."
elif [ $chr_added -eq 0 ]; then
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
chr_added=1
fi
done
#echo $indivBam | sed 's/ /\n/g' | parallel "bash \"$pdir\"/getAboveThreshold.bash \"$indir\"/{} \"$i\" \"$c\" \"$outdir\"/chr\"$i\"/\"$i\"_{}-temp.txt \"$threads\""
done
if [ -z "$nchrList" ]; then
echo "[ERROR] There are no reference sequences with median >0."
exit 1
else
chrList=$nchrList
fi
if [ $stop -eq 3 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 3] ($0:${LINENO}) Exiting after stage 3."; fi
exit 1
fi
## Step 2.1: (STAGE 4 regions2)
# get positions that pass the threshold in every replicate,
# merge if within 90 bp, keep if longer than 100 bp,
# then record merged coverage at every retained position for later analysis.
if [ $quiet -eq 0 ]; then echo "[STAGE 4] ($0:${LINENO}) Finding candidate regions (regions2)..."; fi
nchrList=""
for chr in $chrList
do
OUT="$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"-regions2.txt
if [[ ! -e "$OUT" || $overwrite -le 4 || ($zerowrite -le 4 && ! -s "$OUT") ]]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 4] ($0:${LINENO}) Writing initial candidate regions to file \"$OUT\""; fi
if [ $verbose -eq 1 ]; then set -x; fi
bedops --intersect "$outdir"/$ref_prefix"$chr"/"$user_prefix"_*_"$chr"-regions1.txt |
bedtools merge -d 90 -i - |
awk '{if ($3-$2 >= 100) {print}}' |
awk '{$(NF+1)="segment"NR}1' |
sed 's/ /\t/g' |
bedtools intersect -wa -wb -a - -b "$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"_merged-bga.bedGraph > "$OUT"
set +x
if [ ! -e "$OUT" ]; then
echo "[ERROR] Failed to create \"$OUT\" with region2 data."
exit 1
elif [ ! -s "$OUT" ]; then
echo "[WARNING] No surviving regions in \"$chr\"."
else
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
fi
else
if [ $quiet -eq 0 ]; then echo "[STAGE 4] ($0:${LINENO}) Using existing regions2 file \"$OUT\"."; fi
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
fi
done
if [ -z "$nchrList" ]; then
echo "[ERROR] There are no reference sequences with median >0."
exit 1
else
chrList=$nchrList
fi
if [ $stop -eq 4 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 4] ($0:${LINENO}) Exiting after stage 4."; fi
exit 1
fi
## Step 3.1 and 3.2: (STAGE 5 regions3)
# get $inputLn bp regions
if [ $quiet -eq 0 ]; then echo "[STAGE 5] ($0:${LINENO}) Extracting initial $inputLn bp segments from candidate regions (regions3)...."; fi
arr=($chrList)
m=${#arr[@]}
n=0
nchrList=""
while [ $n -lt $m ]; do
# gather next up to $threads reference names to process
subchrList=""
ncpu=0
while [ $n -lt $m -a $ncpu -lt $threads ]; do
chr=${arr[$n]}
OUT="$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"-regions3.txt
if [[ ! -e "$OUT" || $overwrite -le 5 || ($zerowrite -le 5 && ! -s "$OUT") ]]; then
if [ -z "$subchrList" ]; then
subchrList="$chr"
ncpu=$((ncpu+1))
else
subchrList="$subchrList $chr"
fi
else
if [ $quiet -eq 0 ]; then echo "[STAGE 5] ($0:${LINENO}) Using existing regions3 file \"$OUT\""; fi
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
fi
n=$((n+1))
done
if [ $ncpu -eq 0 ]; then
continue
fi
# run next up to $threads reference names: add chr to reference name files
if [ $quiet -eq 0 ]; then echo "[STAGE 5] ($0:${LINENO}) Writing initial $inputLn-length segments to file \"$outdir/${ref_prefix}*/${prefix}_*-regions3.txt\""; fi
if [ $verbose -eq 1 ]; then set -x; fi
echo "$subchrList" |
sed 's/ /\n/g' |
parallel "Rscript --vanilla \"$pdir\"/getCountFiles.R \"$outdir\"/$ref_prefix{}/\"$prefix\"_{}-regions2.txt \"$inputLn\" $threads \"$outdir\"/$ref_prefix{}/\"$prefix\"_{}-regions3.txt"
set +x
for chr in $subchrList; do
OUT="$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"-regions3.txt
if [ ! -e "$OUT" ]; then
echo "[ERROR] Failed to write \"$OUT\" regions3 file."
exit 1
elif [ ! -s "$OUT" ]; then
echo "[WARNING] Reference sequence \"$ref_prefix$chr\" has no surviving regions in \"$OUT\"."
else
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
fi
done
done
if [ -z "$nchrList" ]; then
echo "[ERROR] There are no reference sequences with median >0."
exit 1
else
chrList=$nchrList
fi
if [ $stop -eq 5 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 5] ($0:${LINENO}) Exiting after stage 5."; fi
exit 1
fi
## Step 3.3: remove blacklist regions (STAGE 6 regions4)
if [ $quiet -eq 0 ]; then echo "[STAGE 6] ($0:${LINENO}) Removing blacklist regions if possible..."; fi
if [[ $genome == "hg" ]]
then
echo "[WARNING] Blacklist regions are from Ensembl genome. This will not work if the genome used for alignment is from UCSC."
bl="$pdir/hg38-blacklist.v2.ensembl.bed"
elif [[ $genome == "mm" ]]
then
echo "[WARNING] Blacklist regions are from Ensembl genome. This will not work if the genome used for alignment is from UCSC."
bl="$pdir/mm10-blacklist.v2.ensembl.bed"
else
echo "[WARNING] Only human and mouse genome blacklists are supported for now. Blacklist regions are Ensembl genome. This will not work if the genome used for alignment is from UCSC. If not mouse or human, will skip removing blacklist regions."
fi
nchrList=""
for chr in $chrList
do
IN="$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"-regions3.txt
if [ ! -s $IN ]; then
echo "[ERROR] Failed to create \"$IN\" region3 data."
exit 1
fi
OUT="$outdir"/$ref_prefix"$chr"/"$prefix"_"$chr"-regions4.txt
if [[ ! -e "$OUT" || $overwrite -le 6 || ( $zerowrite -le 6 && ! -s "$OUT") ]]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 6] ($0:${LINENO}) Writing $inputLn-length without blacklist regions to file \"$OUT\""; fi
if [[ $genome == "hg" ]] || [[ $genome == "mm" ]]; then
if [ $verbose -eq 1 ]; then set -x; fi
if [[ "$chr" == chr* ]]; then
tail -n +2 "$IN" |
sed 's/^chr//' |
bedtools intersect -v -a - -b "$bl" |
sed 's/^/chr/' > "$OUT"
else
tail -n +2 "$IN" |
bedtools intersect -v -a - -b "$bl" > "$OUT"
fi
set +x
else
if [ $verbose -eq 1 ]; then set -x; fi
if [[ "$chr" == chr* ]]; then
tail -n +2 "$IN" > "$OUT"
else
tail -n +2 "$IN" |
sed 's/chr//g' > "$OUT"
fi
set +x
fi
elif [ $quiet -eq 0 ]; then
echo "[STAGE 6] ($0:${LINENO}) Using existing regions4 file \"$OUT\"."
fi
if [ ! -e "$OUT" ]; then
echo "[ERROR] Failed to create \"$OUT\" with region4 data."
exit 1
elif [ ! -s "$OUT" ]; then
echo "[WARNING] No surviving regions in \"$chr\" after intersection with blacklists."
else
if [ -z "$nchrList" ]; then nchrList="$chr"; else nchrList="$nchrList $chr"; fi
fi
done
if [ -z "$nchrList" ]; then
echo "[ERROR] There are no reference sequences with peaks outside blacklist regions."
exit 1
else
chrList=$nchrList
fi
if [ $stop -eq 6 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 6] ($0:${LINENO}) Exiting after stage 6."; fi
exit 1
fi
### Step 4: get counts (STAGE 7 *.covBga.txt)
if [ $quiet -eq 0 ]; then echo "[STAGE 7] ($0:${LINENO}) Extracting segment coverage per replicate..."; fi
declare -a rcl_input_files
arr=($chrList)
m=${#arr[@]}
for bam in $indivBam; do
obam=${bam/.bam/}
n=0
while [ $n -lt $m ]; do
subchrList=""
ncpu=0
while [ $n -lt $m -a $ncpu -lt $threads ]; do
chr=${arr[$n]}
OUT="$outdir"/$ref_prefix"$chr"/"$user_prefix"_"$obam"_"$chr".covBga.txt
rcl_input_files+=("$OUT")
if [[ $overwrite -le 7 || ! -s "$OUT" ]]; then # there should be no empty files at this stage
to_run=1
if [ -z "$subchrList" ]; then
subchrList="$chr"
else
subchrList="$subchrList $chr"
fi
ncpu=$((ncpu+1))
else
if [ $quiet -eq 0 ]; then echo "[STAGE 7] ($0:${LINENO}) Using existing final RCL input file \"$OUT\""; fi
fi
n=$((n+1))
done
if [ $ncpu -eq 0 ]; then
continue
fi
if [ $quiet -eq 0 ]; then echo -e "[STAGE 7] ($0:${LINENO}) Writing RCL $inputLn-length coverage files to file \"$outdir/${ref_prefix}*/${user_prefix}_${obam}_*.covBga.txt\" for reference sequences\n\t$subchrList"; fi
if [ $verbose -eq 1 ]; then set -x; fi
echo "$subchrList" |
sed 's/ /\n/g' |
parallel "bedtools intersect -wb -a \"$outdir\"/$ref_prefix{}/\"$prefix\"_{}-regions4.txt -b \"$outdir\"/\"$user_prefix\"_\"$obam\"-bga.bedGraph | cut -f1-4,8 > \"$outdir\"/$ref_prefix{}/\"$user_prefix\"_\"$obam\"_{}.covBga.txt"
set +x
#echo ${arr[n]} ${arr[n+1]} ${arr[n+2]} | sed 's/ /\n/g' | parallel "bedtools intersect -wb -a \"$outdir\"/chr\"$i\"/\"$i\"regions4.txt -b \"$outdir\"/\"$prefix\"_{}-bga.bedGraph | cut -f1-4,8 > \"$outdir\"/chr\"$i\"/\"$i\".\"$prefix\"_{}.covBga.txt"
done
done
if [ $stop -eq 7 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 7] ($0:${LINENO}) Exiting after stage 7."; fi
exit 1
fi
### Step 5: get big inputs (STAGE 8)
echo $chrList | sed 's/ /\n/g' > "$outdir"/chrList.txt
if [ $quiet -eq 0 ]; then echo "[STAGE 8] ($0:${LINENO}) Getting coordinates of candidate regions..."; fi
candidate_regions_file="$outdir"/"$prefix"_bigInputs.txt
if [ $overwrite -le 8 -o ! -s "$candidate_regions_file" ]; then # there should be no empty files at this stage
if [ $quiet -eq 0 ]; then echo "[STAGE 8] ($0:${LINENO}) Writing RCL candidate regions to file \"$candidate_regions_file\""; fi
if [ $verbose -eq 1 ]; then set -x; fi
Rscript --vanilla "$pdir"/bigInputs.R "$outdir" "chrList.txt" "$prefix" "$candidate_regions_file" "$ref_prefix"
set +x
if [ ! -s "$candidate_regions_file" ]; then
echo "[ERROR] Failed to create \"$candidate_regions_file\" with candidate regions for RCL."
exit 1
fi
else
if [ $quiet -eq 0 ]; then echo "[STAGE 8] ($0:${LINENO}) Using existing candidate regions file \"$candidate_regions_file\""; fi
fi
if [ $stop -eq 8 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 8] ($0:${LINENO}) Exiting after stage 8."; fi
exit 1
fi
### STAGE 9: copy files to RCL expected locations
# Copy final files to place consistent with run_rcl.bash.
# Force overwrite because these files are not linked to this run and can be overwritten any time.
if [ $quiet -eq 0 ]; then echo "[STAGE 9] ($0:${LINENO}) RCL input files being copied to name that run_rcl.bash expects."; fi
for bam in $indivBam; do
obam=${bam/.bam/}
for chr in $chrList; do
IN="$outdir"/$ref_prefix"$chr"/"$user_prefix"_"$obam"_"$chr".covBga.txt
if [ ! -e $IN ]; then
echo "[ERROR] Failed to create RCL input data \"$IN\"."
exit 1
fi
cp "$IN" "$outdir"/$ref_prefix"$chr"/"$obam".covBga.txt
done
done
cp "$candidate_regions_file" "$outdir"/bigInputs.txt
if [ $quiet -eq 0 ]; then
echo -e "[STAGE 9] ($0:${LINENO}) Data preprocessing has completed succesfully.\n\n"
fi
### STAGE 10: clean up temporary files
if [ $save -eq 0 ]; then
if [ $quiet -eq 0 ]; then echo "[STAGE 10] ($0:${LINENO}) Cleaning up temporary files..."; fi
if [ $verbose -eq 1 ]; then set -x; fi
rm "$outdir"/"$prefix"*-bga.bedGraph
rm "$outdir"/$ref_prefix*/"$prefix"*-bga.bedGraph
rm "$outdir"/$ref_prefix*/"$prefix"*-regions[1234].txt
set +x
fi
echo -e "RCL input data are in files:"
for rcl in "${rcl_input_files[@]}"; do
echo -e "\t$rcl"
done
echo -e "Candidate regions are in file:\n\t$candidate_regions_file"
if [[ "$nindivBam" =~ ".*\.srt\.bam" ]]; then
echo "[WARNING] We wrote sorted BAM files \"$nindivBam\". You should use these files instead of \"$indivBam\" in future runs of this script."
fi