This repository has been archived by the owner on Jul 10, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathposeidon2_merge.sh
executable file
·180 lines (163 loc) · 5.25 KB
/
poseidon2_merge.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Author: Clemens & Ayshin
# Entry point of the merge workflow: prints a summary, validates the listed
# packages, merges the janno files and submits the plink merge job.
# Globals (written, kept global as in the original implementation):
#   _input_file_with_list_of_poseidon_packages, _output_directory,
#   _current_date, _output_files_name, _log_file_directory,
#   _plink_input_file, _plink_order_file
# Arguments:
#   $1 - file listing the package directories, one per line
#   $2 - output directory (created if it does not exist)
#   $3 - date string that becomes part of the output file name
#   $4 - log file directory; also receives the plink helper files
# Returns: 1 if the output directory cannot be created
_merge() {
  # catch input variables
  _input_file_with_list_of_poseidon_packages=${1}
  _output_directory=${2}
  # prepare other variables
  _current_date=${3}
  _output_files_name="poseidon2_merge_${_current_date}"
  _log_file_directory=${4}
  _plink_input_file="${_log_file_directory}/poseidon2_merge_plink_input_file.txt"
  _plink_order_file="${_log_file_directory}/poseidon2_merge_plink_order_file.txt"
  # start message
  # (all expansions are quoted so that paths containing spaces and empty
  # arguments keep their position in the helpers' argument lists)
  _merge_start_message "${_input_file_with_list_of_poseidon_packages}" \
    "${_output_directory}" "${_output_files_name}" "${_log_file_directory}"
  _print_packages "${_input_file_with_list_of_poseidon_packages}"
  # check if the input packages are valid
  _check_packages "${_input_file_with_list_of_poseidon_packages}"
  # make output directory; nothing below can succeed without it
  if ! mkdir -p "${_output_directory}"; then
    printf 'Could not create output directory:\n=> %s\n' "${_output_directory}" >&2
    return 1
  fi
  # run steps
  _create_binary_file_list_file "${_input_file_with_list_of_poseidon_packages}" \
    "${_plink_input_file}"
  _janno_merge "${_input_file_with_list_of_poseidon_packages}" \
    "${_output_directory}" "${_output_files_name}"
  _create_order_file_from_fam_files "${_input_file_with_list_of_poseidon_packages}" \
    "${_plink_order_file}"
  _plink_merge "${_plink_input_file}" "${_plink_order_file}" \
    "${_output_directory}" "${_output_files_name}" "${_log_file_directory}"
  printf "\\n"
}
# Prints the banner and a summary of the merge parameters to stdout.
# Arguments:
#   $1 - input file with the package list
#   $2 - output directory
#   $3 - output file name (without extension)
#   $4 - log file directory
_merge_start_message() {
# The here-document delimiter is unquoted so that ${1}..${4} expand. In that
# mode a backslash directly before a newline is a line continuation, so the
# backslash ending the second banner line is written as "\\" to keep the
# banner lines separate.
cat << EOF
_ _ ____
____ ___ ___ ___(_) __| | ___ ____ |___ \\
| _ \ / _ \/ __|/ _ \ |/ _ |/ _ \| _ \ __) |
| |_) | (_) \__ \ __/ | (_| | (_) | | | |/ __/
| __/ \___/|___/\___|_|\____|\___/|_| |_|_____|
|_|
merge => Merges multiple poseidon directories
Input file with package list: ${1}
Output directory: ${2}
Output file name: ${3}.*
Log file directory: ${4}
EOF
}
# Prints the package directories listed in the input file, one per line.
# Arguments:
#   $1 - file listing the package directories, one per line
# Outputs: the list on stdout, followed by an empty line
_print_packages() {
  local _input_file=${1}
  local p
  printf "Packages to be merged:\\n"
  # loop through all package directories; -r keeps backslashes literal and
  # the "|| [[ -n ... ]]" part also handles a last line without a newline
  while read -r p || [[ -n "${p}" ]]; do
    # skip empty lines and lines starting with '#'
    case "${p}" in
      '' | \#*) continue ;;
    esac
    # the name is passed as data, never as the printf format string
    printf '=> %s\n' "${p}"
  done < "${_input_file}"
  printf "\\n"
}
# Runs _check_if_valid_package on every package directory in the input file.
# Arguments:
#   $1 - file listing the package directories, one per line
# Outputs: an empty line on stdout after all packages were checked
_check_packages() {
  local _input_file=${1}
  local p
  # loop through all package directories; -r keeps backslashes literal and
  # the "|| [[ -n ... ]]" part also handles a last line without a newline
  while read -r p || [[ -n "${p}" ]]; do
    # skip empty lines and lines starting with '#'
    case "${p}" in
      '' | \#*) continue ;;
    esac
    # quoted, so a path containing spaces reaches the check as one argument
    _check_if_valid_package "${p}"
  done < "${_input_file}"
  printf "\\n"
}
# Writes the file that is later handed to "plink --merge-list": one line per
# package holding the paths of its .bed, .bim and .fam files.
# Arguments:
#   $1 - file listing the package directories, one per line
#   $2 - path of the list file to create (an existing file is replaced)
# Outputs: a progress message and the path of the created file on stdout
_create_binary_file_list_file() {
  local _input_file=${1}
  local _output_file=${2}
  local p extension _new_file _file_list
  printf "Creating input file for plink merge...\\n"
  # start from an empty output file
  rm -f "${_output_file}"
  touch "${_output_file}"
  # loop through all package directories; -r keeps backslashes literal and
  # the "|| [[ -n ... ]]" part also handles a last line without a newline
  while read -r p || [[ -n "${p}" ]]; do
    # skip empty lines and lines starting with '#'
    case "${p}" in
      '' | \#*) continue ;;
    esac
    # loop through relevant file types (bed, bim, fam)
    # NOTE(review): unlike the janno and fam lookups elsewhere in this file,
    # hidden paths are not filtered out here - confirm whether that is wanted.
    _file_list=""
    for extension in bed bim fam; do
      _new_file=$(find "${p}/" -name "*.${extension}")
      # each entry is prefixed by a space, so the line starts with a space
      _file_list="${_file_list} ${_new_file}"
    done
    # write result to output file
    printf '%s\n' "${_file_list}" >> "${_output_file}"
  done < "${_input_file}"
  # print output file path
  printf '=> %s\n' "${_output_file}"
}
# Concatenates files that share a header line: the header is taken from the
# first input file, followed by lines 2..end of every input file in order.
# Arguments:
#   $1  - output file (overwritten)
#   $2+ - input files
# With no input files an empty output file is written and a note goes to
# stderr; without this guard head and tail would wait for data on stdin.
_merge_multiple_files_with_header() {
  local _output_file=${1}
  shift
  local -a _input_files=("$@")
  if [[ ${#_input_files[@]} -eq 0 ]]; then
    printf 'No input files to merge into:\n=> %s\n' "${_output_file}" >&2
    : > "${_output_file}"
    return 0
  fi
  head -n 1 "${_input_files[0]}" > "${_output_file}"
  # -q suppresses the "==> file <==" banners tail prints for multiple files
  tail -n +2 -q "${_input_files[@]}" >> "${_output_file}"
}
# Collects the .janno files of all listed packages and merges them into
# <output directory>/<output name>.janno via _merge_multiple_files_with_header.
# Arguments:
#   $1 - file listing the package directories, one per line
#   $2 - output directory
#   $3 - output file name without extension
# Outputs: a progress message and the path of the merged file on stdout
# Note: _check_if_newline_at_eof terminates the script for a janno file that
# does not end with a newline.
_janno_merge() {
  local _input_file=${1}
  local _output_file="${2}/${3}.janno"
  local p _found _new_file
  local -a _janno_files=()
  printf "Merge janno files...\\n"
  # loop through all package directories; -r keeps backslashes literal and
  # the "|| [[ -n ... ]]" part also handles a last line without a newline
  while read -r p || [[ -n "${p}" ]]; do
    # skip empty lines and lines starting with '#'
    case "${p}" in
      '' | \#*) continue ;;
    esac
    # NOTE(review): the -path filter drops every path containing "/.", which
    # also applies to dot components of the package path itself (for example
    # "../../pkg") - confirm that such package paths are not expected.
    _found=$(find "${p}/" -name "*.janno" -not -path '*/\.*')
    if [[ -z "${_found}" ]]; then
      continue
    fi
    # find prints one path per line; keep every path as its own array element
    while IFS= read -r _new_file; do
      _check_if_newline_at_eof "${_new_file}"
      _janno_files+=("${_new_file}")
    done <<< "${_found}"
  done < "${_input_file}"
  # merge resulting janno files
  _merge_multiple_files_with_header "${_output_file}" "${_janno_files[@]}"
  # print output file path
  printf '=> %s\n' "${_output_file}"
}
# Terminates the script with status 1 and a message on stdout when the file
# given as $1 does not end with a newline character.
# Arguments:
#   $1 - path of the file to check
_check_if_newline_at_eof() {
  # quoted, so a path containing spaces is checked as one file
  if ! _file_ends_with_newline "$1"; then
    # the path is passed as data, never as the printf format string
    printf 'Missing newline at the end of:\n=> %s\n\n' "$1"
    exit 1
  fi
}
# Returns 0 when the last byte of the file given as $1 is a newline and 1
# otherwise (including for an empty file).
_file_ends_with_newline() {
  local trailing_newlines
  # wc -l counts the newlines within the final byte of the file: 1 or 0
  trailing_newlines=$(tail -c1 "$1" | wc -l)
  if [[ ${trailing_newlines} -gt 0 ]]; then
    return 0
  fi
  return 1
}
# Concatenates the input files and keeps only the first two space-separated
# columns of every line.
# Arguments:
#   $1  - output file (overwritten)
#   $2+ - input files
# With no input files an empty output file is written and a note goes to
# stderr; without this guard cat would wait for data on stdin.
# NOTE(review): cut passes lines that contain no space through unchanged, so
# tab-separated input is not reduced to two columns - confirm the input format.
_merge_multiple_files_and_cut_first_two_columns() {
  local _output_file=${1}
  shift
  local -a _input_files=("$@")
  if [[ ${#_input_files[@]} -eq 0 ]]; then
    printf 'No input files to merge into:\n=> %s\n' "${_output_file}" >&2
    : > "${_output_file}"
    return 0
  fi
  cat "${_input_files[@]}" | cut -f 1,2 -d " " > "${_output_file}"
}
# Collects the .fam files of all listed packages and writes their first two
# columns, in package order, to the order file via
# _merge_multiple_files_and_cut_first_two_columns.
# Arguments:
#   $1 - file listing the package directories, one per line
#   $2 - path of the order file to create
# Outputs: a progress message and the path of the order file on stdout
_create_order_file_from_fam_files() {
  local _input_file=${1}
  local _output_file=${2}
  local p _found _new_file
  local -a _fam_files=()
  printf "Merge fam files to get order file...\\n"
  # loop through all package directories; -r keeps backslashes literal and
  # the "|| [[ -n ... ]]" part also handles a last line without a newline
  while read -r p || [[ -n "${p}" ]]; do
    # skip empty lines and lines starting with '#'
    case "${p}" in
      '' | \#*) continue ;;
    esac
    # NOTE(review): the -path filter drops every path containing "/.", which
    # also applies to dot components of the package path itself (for example
    # "../../pkg") - confirm that such package paths are not expected.
    _found=$(find "${p}/" -name "*.fam" -not -path '*/\.*')
    if [[ -z "${_found}" ]]; then
      continue
    fi
    # find prints one path per line; keep every path as its own array element
    while IFS= read -r _new_file; do
      _fam_files+=("${_new_file}")
    done <<< "${_found}"
  done < "${_input_file}"
  _merge_multiple_files_and_cut_first_two_columns "${_output_file}" "${_fam_files[@]}"
  # print output file path
  printf '=> %s\n' "${_output_file}"
}
# Submits the plink merge as a batch job with sbatch and, after a successful
# merge, moves the plink log into the log directory.
# Arguments:
#   $1 - plink merge list file
#   $2 - order file passed to --indiv-sort f
#   $3 - output directory
#   $4 - output file name without extension
#   $5 - log file directory (also receives the job's .out and .err files)
# Outputs: a progress message on stdout, followed by whatever sbatch prints
_plink_merge() {
  local merge_list order_file out_prefix log_dir
  # The --wrap string is parsed again by a shell inside the job, so every
  # path is shell-escaped first; plain paths are left unchanged by %q.
  printf -v merge_list '%q' "${1}"
  printf -v order_file '%q' "${2}"
  printf -v out_prefix '%q' "${3}/${4}"
  printf -v log_dir '%q' "${5}"
  printf "Merge genome data with plink...\\n=> "
  sbatch -p "short" -c 4 --mem=10000 -J "poseidon2_merge" \
    -o "${5}/poseidon2_%j.out" -e "${5}/poseidon2_%j.err" \
    --wrap="plink --merge-list ${merge_list} --make-bed --indiv-sort f ${order_file} --keep-allele-order --out ${out_prefix} && mv ${out_prefix}.log ${log_dir}/plink.log"
}