-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_poseidon_packages.sh
executable file
·282 lines (247 loc) · 15.1 KB
/
make_poseidon_packages.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env bash
## Function to check that a glob matches exactly one file, which exists.
existsExactlyOne() { [[ $# -eq 1 && -f $1 ]]; }
## Read in -f positional argument
force='FALSE'
force_update_switch='FALSE'
if [[ $1 == "-f" ]]; then
force="TRUE"
fi
if [[ $1 == "-u" ]]; then
force_update_switch="TRUE"
shift
update_batches="$*" ## Update jannos for all batches given after flag.
fi
mkdir -p /mnt/archgen/MICROSCOPE/poseidon_packages
update_switch="off" ## Do any packages need updating? Then update al janno files with information from pandora.
## Colours to make prompts easier to read
Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face
Normal=$(tput sgr0)
processed_batches=''
for seq_batch in $(find /mnt/archgen/MICROSCOPE/eager_outputs/* -maxdepth 0 ! -path "*2021-01-27-Prague_bams"); do
batch_name=$(basename ${seq_batch})
## Prefer single stranded to double stranded library data for genotypes.
if [[ (${force} == "TRUE" && -f ${seq_batch}/genotyping/pileupcaller.single.geno) || (${seq_batch}/genotyping/pileupcaller.single.geno -nt /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/POSEIDON.yml) ]]; then
update_switch="on"
update_batches+="${batch_name} " ## add batch to janno update list
echo -e "${Yellow}SSLib found for ${batch_name}${Normal}"
## If the directory already exists, delete it so trident doesn't complain
if [[ -d "poseidon_packages/${batch_name}" ]]; then
echo "Deleting existing directory poseidon_packages/${batch_name} to recreate package with new genotypes."
rm -r /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}
fi
regex="s/pileupcaller.single/${batch_name}/"
# echo $(basename $seq_batch)
trident init \
--inFormat EIGENSTRAT \
--snpSet 1240K \
--genoFile /mnt/archgen/MICROSCOPE/eager_outputs/${batch_name}/genotyping/pileupcaller.single.geno \
--snpFile /mnt/archgen/MICROSCOPE/eager_outputs/${batch_name}/genotyping/pileupcaller.single.snp \
--indFile /mnt/archgen/MICROSCOPE/eager_outputs/${batch_name}/genotyping/pileupcaller.single.ind \
-o /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name} \
-n ${batch_name}
sed -i -e ${regex} /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/POSEIDON.yml
rename -f -e ${regex} /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/pileupcaller.*
## Remove trailing .txt from path names. Needed for loading the data into R with admixr.
sed -i -e 's/.txt$//' /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/POSEIDON.yml
rename -f -e 's/.txt$//' /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/${batch_name}*txt
## Also make plink format dataset (needed for READ)
trident genoconvert -d /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name} --outFormat PLINK
processed_batches+="${batch_name} "
## If no single stranded genotypes exist, use double stranded library data for genotypes instead.
elif [[ (${force} == "TRUE" && -f ${seq_batch}/genotyping/pileupcaller.double.geno) || (${seq_batch}/genotyping/pileupcaller.double.geno -nt /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/POSEIDON.yml) ]]; then
update_switch="on"
update_batches+="${batch_name} " ## add batch to janno update list
echo -e "${Yellow}DSLib found for ${batch_name}${Normal}"
## If the directory already exists, delete it so trident doesn't complain
if [[ -d "poseidon_packages/${batch_name}" ]]; then
echo "Deleting existing directory poseidon_packages/${batch_name} to recreate package with new genotypes."
rm -r /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}
fi
regex="s/pileupcaller.double/${batch_name}/"
# echo $(basename $seq_batch)
trident init \
--inFormat EIGENSTRAT \
--snpSet 1240K \
--genoFile /mnt/archgen/MICROSCOPE/eager_outputs/${batch_name}/genotyping/pileupcaller.double.geno \
--snpFile /mnt/archgen/MICROSCOPE/eager_outputs/${batch_name}/genotyping/pileupcaller.double.snp \
--indFile /mnt/archgen/MICROSCOPE/eager_outputs/${batch_name}/genotyping/pileupcaller.double.ind \
-o /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name} \
-n ${batch_name}
sed -i -e ${regex} /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/POSEIDON.yml
rename -f -e ${regex} /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/pileupcaller.*
## Remove trailing .txt from path names. Needed for loading the data into R with admixr.
## DEPRECATED 01/11/2022. Update of eager to v 2.4.5 means pileupcaller output no longer has txt suffix
# sed -i -e 's/.txt$//' /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/POSEIDON.yml
# rename -f -e 's/.txt$//' /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}/${batch_name}*txt
## Also make plink format dataset (needed for READ)
trident genoconvert -d /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name} --outFormat PLINK
fi
done
if [[ ${force_update_switch} == "TRUE" || ${update_switch} == "on" ]]; then
for batch in ${update_batches}; do
## Set the population field in fam file to the Site ID
echo -e "${Yellow}${batch}: Setting Population name in .fam and .ind files to Pandora Site_Id.${Normal}"
fam_f="/mnt/archgen/MICROSCOPE/poseidon_packages/${batch}/${batch}.fam"
awk -v OFS="\t" -F "\t" '{$1=substr($2,1,3); print $0}' ${fam_f} >${fam_f}.2
# mv ${fam_f} ${fam_f}.old
mv ${fam_f}.2 ${fam_f}
## Set the population field in ind file to the Site ID
ind_f="/mnt/archgen/MICROSCOPE/poseidon_packages/${batch}/${batch}.ind"
awk -v OFS="\t" -F "\t" '{$3=substr($1,1,3); print $0}' ${ind_f} >${ind_f}.2
# mv ${ind_f} ${ind_f}.old
mv ${ind_f}.2 ${ind_f}
## Add Site_ID and Site_Name to .janno
echo -e "${Yellow}${batch}: Updating package metadata using eager2poseidon${Normal}"
janno_f="/mnt/archgen/MICROSCOPE/poseidon_packages/${batch}/${batch}.janno"
## Make Group_Name of janno be the Site_ID from pandora, to match fam and ind files.
temp_janno="$(dirname ${janno_f})/temp_janno.janno"
head -n1 ${janno_f} > ${temp_janno}
## Group_Name is column 3 in poseidon 2.5.0 packages made with trident 0.26.1 +. Used to be column 19 in the past.
tail -n +2 ${janno_f} | awk -F '\t' -v OFS='\t' '{$3=substr($1,1,3); print $0}' >> ${temp_janno}
eager_result_dir="/mnt/archgen/MICROSCOPE/eager_outputs/${batch}"
## Keep only ssDNA data when ssDNA data is available
## If one and only one file matching *single*geno* exists in the genotyping dir of the eager output for the batch, then prefer single stranded data.
if existsExactlyOne ${eager_result_dir}/genotyping/*single*geno*; then
library_preference='single'
ss_suffix='-S _ss'
else
library_preference='double'
ss_suffix=''
fi
## Run eager2poseidon
## Requires poseidonR from this version. Newer version clashes with something ## remotes::install_github('poseidon-framework/poseidonR', ref='99f1e9c9954d9b7e79e796c24e90036f2576a112')
echo "~/Software/github/sidora-tools/eager2poseidon/exec/eager2poseidon.R \
--input_janno ${temp_janno} \
--eager_tsv /mnt/archgen/MICROSCOPE/eager_inputs/${batch}.eager_input.tsv \
--general_stats_table ${eager_result_dir}/multiqc/multiqc_data/multiqc_general_stats.txt \
--credentials ~/Software/github/Schiffels-Popgen/MICROSCOPE-processing-pipeline/.credentials \
--genotypePloidy haploid \
--snp_cutoff 100 \
--keep_only ${library_preference}" \
${ss_suffix} | tr -s " "
~/Software/github/sidora-tools/eager2poseidon/exec/eager2poseidon.R \
--input_janno ${temp_janno} \
--eager_tsv /mnt/archgen/MICROSCOPE/eager_inputs/${batch}.eager_input.tsv \
--general_stats_table ${eager_result_dir}/multiqc/multiqc_data/multiqc_general_stats.txt \
--credentials ~/Software/github/Schiffels-Popgen/MICROSCOPE-processing-pipeline/.credentials \
--genotypePloidy haploid \
--snp_cutoff 100 \
--keep_only ${library_preference}\
${ss_suffix}
## Keep original janno just as backup.
mv ${janno_f} ${janno_f}.old
mv ${temp_janno} ${janno_f}
## If an identical twin annotation file exists for the batch, then update the Alternative_ID field of the `into_this` individual in the janno file to denote it contains data for both Pandora IDs,
## and rename the population field of the merge_this individual so the data is not in the population twice.
if [[ -f /mnt/archgen/MICROSCOPE/identical_twins_per_batch/${batch}.identical_twins.tsv ]]; then
echo -e "${Yellow}${batch}: Marking identical twins/duplicate individuals in janno file ${Normal}"
## Infer the index of the Group_Name and Alternative_ID columns in the janno file
group_name_col=$(head -n1 ${janno_f} | tr '\t' '\n' | grep -n 'Group_Name' | cut -d':' -f1)
alt_id_col=$(head -n1 ${janno_f} | tr '\t' '\n' | grep -n 'Alternative_IDs' | cut -d':' -f1)
note_field_col=$(head -n1 ${janno_f} | tr '\t' '\n' | grep -wn 'Note' | cut -d':' -f1)
relation_to_col=$(head -n1 ${janno_f} | tr '\t' '\n' | grep -wn 'Relation_To' | cut -d':' -f1)
relation_degree_col=$(head -n1 ${janno_f} | tr '\t' '\n' | grep -wn 'Relation_Degree' | cut -d':' -f1)
relation_note_col=$(head -n1 ${janno_f} | tr '\t' '\n' | grep -wn 'Relation_Note' | cut -d':' -f1)
merge_these=()
into_these=()
while read -r merge_this into_this; do
## Skip header
if [[ ${merge_this} == "merge_this" ]]; then
continue
fi
## Create bash array with all entries in the merge_this and into_this columns
merge_these+=("${merge_this}")
into_these+=("${into_this}")
done < /mnt/archgen/MICROSCOPE/identical_twins_per_batch/${batch}.identical_twins.tsv
## Pass the full arrays to awk and update the janno file
## WARNING This awk command will not pick up on triplets of identical individuals. one individual will be lost in the Janno.
awk \
-v group_name_col="${group_name_col}" \
-v alt_id_col="${alt_id_col}" \
-v merge_these="${merge_these[*]}" \
-v into_these="${into_these[*]}" \
-v note_field_col="${note_field_col}" \
-v relation_to_col="${relation_to_col}" \
-v relation_degree_col="${relation_degree_col}" \
-v relation_note_col="${relation_note_col}" \
-v n="${#merge_these[@]}"\
-F "\t" \
-v OFS="\t" \
'
BEGIN{
split(merge_these,a," ")
split(into_these ,b," ")
for (x=1;x<=n;++x) {
tsv_merge_this[a[x]]=a[x]
tsv_into_this[b[x]]=b[x]
merged_into[a[x]]=b[x]
merged_from[b[x]]=a[x]
}
}
## With identical indivs TSV looking like this:
## merge_this into this
## ABC001 ABC002
##
## tsv_merge_this = ["ABC001" : "ABC001" ]
## tsv_into_this = ["ABC002" : "ABC002" ]
## merged_into = ["ABC001" : "ABC002" ]
## merged_from = ["ABC002" : "ABC001" ]
{
if ( $1 in tsv_merge_this ) {
$group_name_col = $group_name_col"_data_merged_into_"merged_into[$1]
$relation_to_col = merged_into[$1]
$relation_degree_col = "identical"
$relation_note_col = "Identity of individuals was specified in identical_twins annotation file."
} else if ( $1 in tsv_into_this ) {
$alt_id_col = $1"_"merged_from[$1]"_merged"
$note_field_col = "This Poseidon_ID contains the merged data from identical individuals "$1" and "merged_from[$1]"."
$relation_to_col = merged_from[$1]
$relation_degree_col = "identical"
$relation_note_col = "Identity of individuals was specified in identical_twins annotation file."
}
print $0
}
' ${janno_f} > ${janno_f}.2
## Replace origianl janno file with updated version
mv ${janno_f}.2 ${janno_f}
fi
echo -e "${Yellow}${batch}: Adding genetic sex to fam and ind files ${Normal}"
## Update genetic sex in .fam file based on sexes in janno file
mv ${fam_f} ${fam_f}.old ## Keep old fam file around for testing
## First paste together the fam and janno files, throw away the header line, then use awk to update the sex in the fam file based on the janno file
## Additionally, replace the group name in the fam file ($1) with the new Group_Name in the janno file ($9)
## Results are saved to ${fam_f}
(echo ""; cat ${fam_f}.old) | \
paste - <( awk -F "\t" -v OFS="\t" '{print $1,$2,$3}' ${janno_f}) | \
tail -n +2 | \
awk -F "\t" -v OFS="\t" '{
if ($8 == "M") {
$5=1
} else if ($8 == "F") {
$5=2
}; print $9, $2, $3, $4, $5, $6}' > ${fam_f}
## Update genetic sex and group name in .ind files
mv ${ind_f} ${ind_f}.old
awk -F "\t" -v OFS="\t" '{print $1,$2,$3}' ${janno_f} | tail -n +2 >${ind_f}
echo -e "${Yellow}${batch}: Updating POSEIDON.yml hashes ${Normal}"
## Then update the hashes in POSEIDON.yml
trident update -d $(dirname ${ind_f})
## Finally, validate the produced package
trident validate -d /mnt/archgen/MICROSCOPE/poseidon_packages/${batch_name}
if [[ $? != 0 ]]; then
echo -e "${Red}${batch}: Package '${batch}' failed validation. Halting execution.${Normal}"
else
echo -e "${Yellow}${batch}: Package '${batch}' processed.${Normal}"
fi
done
else
echo "No packages needed updating. Halting execution."
fi
## Print out the batches that got updates for posting on Mattermost
if [[ ${processed_batches} != '' ]]; then
echo "$(date -Idate) Poseidon package updates:"
fi
for processed_batch in ${processed_batches}; do
echo " - \`${processed_batch}\`"
done