-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreal_data_pipeline
108 lines (84 loc) · 4.58 KB
/
real_data_pipeline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#prepare the reference genome
cd /mnt/c/Fraca_raw
out="PopB_pooled_L4" #25 individuals
out_dir="/mnt/c/Fraca_out/"
Fraca_dir="/mnt/H/julia/ngsJulia/ngsPool/pooled_WGS/"
chr_pool=2*25 #Arabidopsis lyrata has 8 chromosomes (without information specified from Fraca, assume they're all pooled) but diploid
#chromosome pooled=ploidy * nIndividual
min=4 #read depth should be at least 4
#depth depends on L1-L4
max=$chr_pool*$depth # maximum coverage: 50(chromosome=ploidy * individual)*10(depth per individual?)=ploidy * depth
min_qual=20 #base quality
l_npstat=1000 #bp of the windows for NPStat
min_all=2 #minimum allele count for Snape, VarScan and NPStat
pp_snape=0.9 #posterior probability threshold for Snape
#process_dir:
# mkdir ref
# mkdir map
# mkdir out
cp ref.fasta ref
bwa index ref/ref.fasta
gzip -d $out"_2.fastq.gz"
gzip -d $out"_1.fastq.gz" # only FASTQ usually stored in GZ due to memory, fasta occupy much less
#map the reads to the ref genome
bwa aln -n 0.01 -l 100 -o 1 -d 12/ -e 12 -t 8 ref/ref.fasta $out"_1.fastq" > map/$out"_1.sai"
bwa aln -n 0.01 -l 100 -o 1 -d 12/ -e 12 -t 8 ref/ref.fasta $out"_2.fastq" > map/$out"_2.sai"
bwa samse ref/ref.fasta map/$out"_1.sai" $out"_1.fastq" > map/$out"_1.sam"
bwa samse ref/ref.fasta map/$out"_2.sai" $out"_2.fastq" > map/$out"_2.sam"
gzip $out"_1.fastq"
gzip $out"_2.fastq"
# Creat mpileup file
samtools mpileup -f ref/ref.fasta -B map/$out"_1_2.bam" map/$out"_2_2.bam" > $out.mpileup
#-r 8scaffold.bed, without bed file works all right
#-b (input bam file list) as -B (disable BAQ (per-Base Alignment Quality))
wc *mpileup
##Use mpileup file to compare the performance####################################
cd /mnt/c/Fraca_raw
out="PopB_pooled_L4" #the name of file analyzed
Fraca_dir="/mnt/H/julia/ngsJulia/ngsPool/pooled_WGS/" #snape and varscan not written but used by Fraca
Popoo_dir="/mnt/H/julia/ngsJulia/ngsPool/popoolation2_1201/"
#1.Popoolation
echo "perl time"
time perl $Popoo_dir"mpileup2sync.pl" --fastq-type sanger --min-qual 20 --input $out.mpileup --output out/$out.sync
##should the sanger be illuminia here? as it's pooled data. Or the sanger just refer to the fastq type...
#or
echo "java time"
time java -ea -Xmx7g -jar $Popoo_dir"mpileup2sync.jar" --input $out.mpileup --fastq-type sanger --min-qual 20 --threads 8 --output out/$out"_java.sync"
#much less time
#snape or varscan ########
#2. Snape---------------------------------------------
#The nucleotide diversity and the genetic differentiation from the reference genome that are needed
#to set prior probabilities in the Bayesian model of Snape were calculated by NPStat [37].
#in linux run `gcc NPStat-v1.c -I$GSL_HOME/include -L$GSL_HOME/lib -lgsl -lgslcblas -lm` to install the npstat
$Fraca_dir"npstat" -n $chr_pool -l $l_npstat -mincov $min -maxcov $max -minqual $min_qual -nolowfreq $min_all $out.mpileup > out/$out.stats
#omitted -outgroup $scaffold_fa
#will output $out".stats" in default
#NR: number of records variable (awk NR gives the total number of records being processed or line number)
theta=$(awk '{sum+=$6} END {print sum/NR}' out/$out".mpileup.stats") #here $out"_filt.mpileup.stats" as input file
#nucleotide diversity
#mean(sum/NR) of the 6th column
D=$(awk '{sum+=$13} END {print sum/NR}' out/$out".mpileup.stats")
#Prior genetic difference between reference genome and population
echo "#Chromosomes $out, theta is $theta, D is $D" > out/$out"_stat" #won't print
$Fraca_dir"snape-pooled" -nchr $chr_pool -theta $theta -D $D -fold unfolded -priortype informative < $out.mpileup | awk '$9 > '$pp_snape'' | awk '$5 >='$min_all'' > out/$out.snape
#$out"_filt.*" => $out".*"
#wc -l out/$out".snape" >> out/$out"_stat"
#3. VarScan--------------------------------------------------------
echo "VarScan time"
java -Xmx2g -jar $Fraca_dir"VarScan.v2.3.9.jar" mpileup2snp $out".mpileup" --min-coverage $min --min-avg-qual $min_qual --min-reads2 $min_all --p-value 0.05 > out/$out.varscan
#use mpileup2snp instead of pileup2snp
#also not mpileup2cns(SNP/Indel/Reference)
#$out"_filt.*" => $out".*"
#wc -l out/$out".varscan" >> out/$out"_stat"
#4. ngsPool---------------------------------------------------------
#julia ngsPool.jl
gzip $out.mpileup
echo "julia time"
cd /mnt/c/Fraca_raw
out="PopB_pooled_L4"
time julia /mnt/H/julia/ngsJulia/ngsPool/ngsPool.jl --fin $out.mpileup.gz --fout out/$out"-snp.out.gz" --nChroms 25
#--lrtSnp 7.82 as default if not specified
# 14 individuals in population A, 25 in population B
time julia /mnt/H/julia/ngsJulia/ngsPool/ngsPool.jl --fin $out.mpileup.gz --fout out/$out.out.gz --nChroms 25 --lrtSnp -Inf
#real 87m42.696s
#user 61m31.125s