-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_data_from_identical_twins.sh
executable file
·66 lines (53 loc) · 3.36 KB
/
merge_data_from_identical_twins.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env bash
## This script is run on the eager input TSV of a batch only if an identical twins annotation file exists for the batch.
## It duplicates the rows of the individual in the merge_this column, and replaces the individual ID with the one in the into_this column.
## To avoid file name collisions, it also creates symlinks for the R1/R2 columns of the merge_this individual, and points to those symlinks in the R1/R2 of the duplicated rows.
## Usage:
## merge_data_from_identical_twins.sh <input_eager_tsv> <identical_twins_tsv>
DIR="/mnt/archgen/MICROSCOPE"
input_eager_tsv=$(readlink -f "${1}")
identical_twins_tsv=$(readlink -f "${2}")
temp_tsv="${DIR}/.tmp/tsvs/$(basename ${input_eager_tsv}).tmp"
symlink_dir="$(dirname ${identical_twins_tsv})/symlinks/$(basename -s .identical_twins.tsv ${identical_twins_tsv})"
## Copy the input eager TSV to a temporary file
cp ${input_eager_tsv} ${temp_tsv}
## Read through the identical twins annotation file and create symbolic links for the R1/R2 columns of the merge_this individual, then
## duplicate the rows of the individual in the merge_this column, and replace the individual ID with the one in the into_this column, and the R1/R2 columns with the symlinks created above.
while read -r merge_this into_this; do
if [[ ${merge_this} == 'merge_this' ]]; then
## Ignore the header line
continue
fi
## Read and unpack the lines of the input tsv that correspond to the merge_this individuals (with or without the'_ss' suffix)
while IFS=$'\t' read -r sample_name library_id lane colour_chemistry seqtype organism strandedness udg_treatment r1 r2 bam ; do
data_target_ind="${into_this}"
## If data is single-stranded, then the sample name will be suffixed with '_ss', and the target individual ID should be suffixed with '_ss' as well.
if [[ ${sample_name} != ${merge_this} ]] && [[ ${sample_name} == ${merge_this}_ss ]]; then
data_target_ind="${into_this}_ss"
fi
## DEBUG
# for field in sample_name library_id lane colour_chemistry seqtype organism strandedness udg_treatment r1 r2 bam; do
# echo -e "${field}:\t${!field}"
# done
# continue
## Create symlinks for the R1/R2/BAM columns of the merge_this individual
mkdir -p ${symlink_dir}
for field in r1 r2 bam; do
if [[ ${!field} == 'NA' ]]; then
eval "new_${field}='NA'"
else
target="${symlink_dir}/${data_target_ind}_$(basename ${!field})"
eval "new_${field}=${target}"
ln -s ${!field} ${target}
fi
done
## Duplicate the rows of the individual in the merge_this column, and replace the individual ID with the one in the into_this column, prefix the library_ID with the target Individual ID, and replace the R1/R2 columns with the symlinks created above.
echo -e "${data_target_ind}\t${data_target_ind}_${library_id}\t${lane}\t${colour_chemistry}\t${seqtype}\t${organism}\t${strandedness}\t${udg_treatment}\t${new_r1}\t${new_r2}\t${new_bam}" >> ${temp_tsv}
done < <(awk -v merge_this="${merge_this}" 'BEGIN{OFS=IFS="\t"} $1 == merge_this || $1 == merge_this"_ss"' ${input_eager_tsv})
done < ${identical_twins_tsv}
## Finally, make a backup of the input eager TSV and then replace it with the temporary file
cp ${input_eager_tsv} ${input_eager_tsv}.bak
## Only replace file if backup was successful
if [[ $? -eq 0 ]]; then
mv ${temp_tsv} ${input_eager_tsv}
fi