-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunc_read_standard.jl
367 lines (314 loc) · 13.9 KB
/
func_read_standard.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
#func_read_plot2(2020/8/12)
using CSV, CodecZlib, Plots, DelimitedFiles, Plots.PlotMeasures, DataFrames
#form: :chrom(string), :position(Int), :algorithm_software(float)]
#name.extension
#standard input form
# "$name.txt" #population allele frequency simulated
# "$name.snp.gz" #estimation of ngsPool with SNP calling
# "$name.sync" #Popoolation2
# "$name.snape"
# "$name.varscan"
#use the functions
# df_snp=read_ngsPool_snp(name)
# df_true = read_true(name)
# df_popoo = read_p2(name)
# varscan_p=read_varscan(name)
# varscan_nallele= read_nallele_varscan2(name) #mumber of reference or alternative allele
# snape_p, snape_nallele = read_snape(name)
#read the gzipped file of ngsPool with SNP calling
function read_ngsPool_snp(name) #all are SNP calling
if occursin("snp", name)
maf_snp = open(GzipDecompressorStream, "$name.snp.gz", "r") do stream #estimated
CSV.read(stream) #output to df
end #1000 rows--sites, maf as dataframe
else
maf_snp = open(GzipDecompressorStream, "$name.gz", "r") do stream #estimated
CSV.read(stream) #output to df
end #1000 rows--sites, maf as dataframe
end
df_snp = hcat(maf_snp[!,1], maf_snp[!, [:position, :maf,:freqMax, :freqE]])
rename!(df_snp, [:chrom, :position, :GSS_ngsPool,:freqMax_ngsPool, :freqE_ngsPool])
#Golden-section search (GSS); maf with maximum SFS likelihood, expected maf with SFS likelihood
return df_snp #dataframe
end
#read the gzipped file of ngsPool with SNP calling
function read_ngsPool_snp2(name) #no snp in the name
maf_snp = open(GzipDecompressorStream, "$name.gz", "r") do stream #estimated
CSV.read(stream) #output to df
end #1000 rows--sites, maf as dataframe
df_snp = hcat(maf_snp[!,1], maf_snp[!, [:position, :maf]]) #:freqMax, :freqE, those three are too close so
rename!(df_snp, [:chrom, :position, :GSS_ngsPool]) #,:freqMax_ngsPool, :freqE_ngsPool
#Golden-section search (GSS); maf with maximum SFS likelihood, expected maf with SFS likelihood
return df_snp #dataframe
end
# cd("C:\\simu_raw\\sig")
# name="simu-1-100-0.1"
# df_snp=read_ngsPool_snp(name)
#read txt file output while simulating pool data
function read_true(name)
# if occursin("txt", name)
daf = readdlm("$name.txt", '\t', String, '\n') #true #daf as matrix
daf_1 = daf[:, 1]
daf_2 = parse.(Int64, daf[:, 2])
daf_end = parse.(Float64, daf[:, 5])#wrong, true maf is in 5th column, the end column is the simple reads proportion of reads(no meaning)
df_true = DataFrame(chrom = daf_1, position = daf_2, true_maf = daf_end)
# elseif occursin("gz", name)
# daf = open(GzipDecompressorStream, name, "r") do stream #estimated
# CSV.read(stream, header=false) #output to df
# end #1000 rows--sites, maf as dataframe
# df_true = daf[!,[1,2,5]]
# df_true = rename!(df_true, [:chrom, :position, :true_maf])
# end
return df_true
end
# cd("C:\\simu_raw\\sig")
# name="test-1-100"
# df_true = read_true(name)
# df_snp2=innerjoin(df_snp, df_true, on=[:chrom, :position])
# df_snp3=sort(df_snp2, :true_maf)
# mtx_snp3=convert(Matrix,df_snp3[3:end])
# scatter(mtx_snp3[:,end], mtx_snp3[:,1:end-1])
# plot(mtx_snp3)
# #differ but very close
#read the output of Popoolation2
function read_p2(name) #Popoolation2
# println("$name.sync")
daf = readdlm("$name.sync", '\t', String, '\n')
daf_count = daf[:, 4]
nsite = length(daf_count)
mafs = zeros(nsite)
for i in 1:length(daf_count)
count = parse.(Int, split(daf_count[i], ":"))
maf = sort(count)[end-1]/sum(count) #minor allele frequency
mafs[i] = maf
end
daf_2 = parse.(Int, daf[:,2])
df_final = DataFrame(chrom=daf[:,1], position=daf_2, popoo=mafs)
return df_final
end
# cd("C:\\Fraca_raw\\out")
# name="PopA_pooled_java"
# df_popoo = read_p2(name)
#
# count(i->i!=0, df_popoo[:,3]) #SNP calling: 7870 out of 44856
#read varscan estimation file and output Minor allele frequencies
function read_varscan(name) #input such as "PopB_pooled_L4"
raw=CSV.read("$name.varscan", header=true)
#filter out the sites filtered by min coverage
NA_row=[]
all_row=collect(1:length(raw[!,end]))
for i in all_row
if occursin(r":-:-:-:-", raw[i,end]) #filter out the row not called
push!(NA_row, i)
end
end
left_row = filter(x->!(x in NA_row), eachindex(all_row))
raw=raw[left_row,:]
cross_samp = raw[:, 5]
each_sample = raw[:, end]
nr = length(cross_samp) #nrow
maf_p_mtx = Array{String}(undef,nr,4) #1,2col cross-sample; 3,4 each sample
for i in 1:nr
maf_p_mtx[i,[1,2]]=split(cross_samp[i], ":")[[5,6]] #maf and fisher's exact test(FET) p value
maf_p_mtx[i,[3,4]]=split(each_sample[i], ":")[[5,6]] #maf and fisher's exact test(FET) p value
#maf frequency and Fisher's exact test p value
end
#convert string to decimals
maf_p_mtx2=Array{Float64}(undef,nr,4)
for i in 1:nr, j in [1,3]
# println([i, j])
# println(maf_p_mtx[i, j])
# println(each_sample[i, end])
maf_p_mtx2[i, j] = parse(Float64, chop(maf_p_mtx[i, j]))/100
end
for i in 1:nr
p = maf_p_mtx[i, 2]
maf_p_mtx2[i, 2] = parse(Float64, p)
end
maf_p_mtx3=DataFrame(maf_p_mtx2, [:cross_samp_maf, :cross_samp_p, :each_samp_maf, :each_samp_p])
maf_p_mtx_final=hcat(raw[:,1:2], maf_p_mtx3)
rename!(maf_p_mtx_final, [:chrom,:position,:cross_samp_maf, :cross_samp_p, :each_samp_maf, :each_samp_p])
return maf_p_mtx_final
end
# cd("C:\\Fraca_raw\\out")
# using CSV, DataFrames
# name="PopB_pooled_L2" #all [33,3] only L2 [34,3]
# name="PopB_pooled_L4"
# varscan_p=read_varscan(name)
# #check the - problem############################
# raw=CSV.read("$name.varscan", header=true)
# for i in 1:length(raw[!,end])
# if occursin(r":-:-:-:-", raw[i,end])
# println(i)
# end
# end
#
# for i in 1:length(raw[!,5])
# if occursin(r":-:-:-:-", raw[i,end])
# println(i)
# end
# end
# #N:2:-:-:-:- G:11:0:11:100%:1.4176E-6 #what amendment after renewing? why? before it can function
# #same sites absent in each-sample and cross-sample test(speculate due to min coverage 4)
# #lose around 20/98 sites
# #consider set the minor allele count as 2?
# #and ignore those filtered sites (if)
################################################
#read varscan estimation file and output the number of reads of varscan
function read_nallele_varscan2(name) #input such as "PopB_pooled_L4"
raw=CSV.read("$name.varscan", header=true)
#filter out the sites filtered by min coverage
NA_row=[]
all_row=collect(1:length(raw[!,end]))
for i in all_row
if occursin(r":-:-:-:-", raw[i,end])
push!(NA_row, i)
end
end
left_row = filter(x->!(x in NA_row), eachindex(all_row))
raw=raw[left_row, :]
cross_samp = raw[:,5]
each_sample = raw[:,end]
nr = length(cross_samp) #nrow
nallele_mtx=DataFrame(Array{Int}(undef, nr, 4), [:ref_crs_varscan, :alt_crs_varscan,:ref_each_varscan, :alt_each_varscan])
#1,2col cross-sample; 3,4 each sample
for i in 1:nr , j in [3,4]
c_i=split(cross_samp[i], ":")[j][1]
nallele_mtx[i, j-2]=parse(Int, c_i)
#maf frequency and Fisher's exact test p value
end
for i in 1:nr , j in [3,4]
e_i=split(each_sample[i], ":")[j][1]
nallele_mtx[i, j]=parse(Int, e_i)
#maf frequency and Fisher's exact test p value
end
nallele_mtx=hcat(raw[:,1:2], nallele_mtx)
rename!(nallele_mtx, [:chrom,:position,:ref_crs_varscan, :alt_crs_varscan,:ref_each_varscan, :alt_each_varscan])
return nallele_mtx
end
# cd("C:\\Fraca_raw\\out")
# using CSV, DataFrames
# name="PopB_pooled_L4"
# varscan_nallele= read_nallele_varscan2(name) #mumber of reference or alternative allele
#read the estimation file of Snape and output two files (_p as MAFs, _num as the number of reads)
function read_snape(name)
# name="PopB_pooled_L4"
raw=CSV.read("$name.snape", header=false)
if isempty(raw)
println("$name.snape is empty")
return DataFrame(), DataFrame()
end
final_p=raw[:,[1,2,end]]
rename!(final_p, [:chrom,:position,:maf_snape])
final_num=raw[:,[1,2,3,4]]
rename!(final_num, [:chrom,:position,:ref_snape,:alt_snape])
return final_p, final_num
end
# cd("C:\\Fraca_raw\\out")
# using CSV, DataFrames
# name="PopB_pooled_L4"
# snape_p, snape_nallele = read_snape(name)
#read varscan
##Root mean square error with input in different data types
RMSE(maf::Float64, true_maf::Float64) = sqrt(sum( (maf .- true_maf).^2 )/ length(maf) )
RMSE(maf::Float64, true_maf::Missing) = missing
RMSE(maf::Missing, true_maf::Float64) = missing
RMSE(maf::Missing, true_maf::Missing) = missing
#calculate Mean percentage error
#input as two numbers
MPE(maf::Float64, true_maf::Float64) = (maf - true_maf) / true_maf
#delete the entry with true_maf=0, might not be the standard MPE methods
#input as two vectors
function MPE(maf::Array, true_maf::Array) #input two vectors and get one value (vectors have to be the replicates)
if !ismissing(maf) && !ismissing(true_maf)
nzero = count(iszero, true_maf)
mpe_itr = (maf .- true_maf) ./ true_maf
mpe_itr[true_maf .== 0] .= 0
mpe = sum(mpe_itr) / (size(maf)[1]-nzero) #not include the entry with true as 0
return mpe
else
return missing
end
end
################################################################################
#The annotation of the outputs of ngsPool, Popoolation2, Snape and VarScan
################################################################################
##ngsPool output######################################
#the first row:
# chrom position reference nonreference major minor lrtSNP lrtBia lrtTria maf freqMax freqE
#corresponding annotation:
#chromosome, position,reference allele, nonreference allele, major allele, minor allele,
#likelihood ration test statistic (lrt) of SNP, lrt of biallelic site, lrt of triallelic,
#minor allele frequency (from Golden Section Search),
#Maximum likelihood estimation (from SFS)
#Expected estimation (from SFS)
##Popoolation2######################################
# (source: https://sourceforge.net/p/popoolation2/wiki/Tutorial/)
# Sample of a synchronized file:
#
# 2R 2302 N 0:7:0:0:0:0 0:7:0:0:0:0
# 2R 2303 N 0:8:0:0:0:0 0:8:0:0:0:0
# 2R 2304 N 0:0:9:0:0:0 0:0:9:0:0:0
# 2R 2305 N 1:0:9:0:0:0 0:0:9:1:0:0
# col1: reference contig
# col2: position within the refernce contig
# col3: reference character
# col4: allele frequencies of population number 1
# col5: allele frequencies of population number 2
# coln: allele frequencies of population number n
# The allele frequencies are in the format A:T:C:G:N:del, i.e: count of bases 'A',
# count of bases 'T',... and deletion count in the end (character '*' in the mpileup)
##Snape output######################################
#10 fields:
#1. chr: genomic coordinates
#2. position along the chronosome
#3. ref nucleotides
#4. num of minor (alternative) nucleotide
#5. mean quality of the reference nucleotide
#6. mean quality of the alternative nucleotide
#7. first and second most frequent nucleotides in the pileup
#8. 1-p(0), p(f) as the probability distribution function for the maf (f)
#9. p(1)
#10. mean value of f
# what's num of minor (alternative) nucleotide for?
# 5.6. are around 60 and 70, seems unrealistic(0.0001% error rate) compared to
#(Kofler et al., 2011) Popoolation, should be around 20 to 30
#what's the unit?
# 1-p(0)-p(1) is the probability of SNP? Any threshold?
# why 1-p(0) can be greater than 1? (no matter accumulative or density)
# can 10. be compared with the expected mean in ngsPool?
##Varscan#############################################
# OUTPUT
# Tab-delimited SNP calls with the following columns:
# 1.Chrom chromosome name
# 2.Position position (1-based)
# 3.Ref reference allele at this position
# 4.Var variant allele observed
# 5.PoolCall Cross-sample call using all data (Cons:Cov:Reads1:Reads2:Freq:P-value)
# 1)Cons - consensus genotype in IUPAC format
# 2)Cov - total depth of coverage
# 3)Reads1 - number of reads supporting reference
# 4)Reads2 - number of reads supporting variant
# 5)Freq - the variant allele frequency by read count
##5.5)lower false discovery rate in SNP calling
# 6)P-value - FET p-value of observed reads vs expected non-variant
# 6.StrandFilt Information to look for strand bias using all reads (R1+:R1-:R2+:R2-:pval)
# R1+ = reference supporting reads on forward strand
# R1- = reference supporting reads on reverse strand
# R2+ = variant supporting reads on forward strand
# R2- = variant supporting reads on reverse strand
# pval = FET p-value for strand distribution, R1 versus R2
# 7.SamplesRef Number of samples called reference (wildtype)
# 8.SamplesHet Number of samples called heterozygous-variant
# 9.SamplesHom Number of samples called homozygous-variant
# 10.SamplesNC Number of samples not covered / not called
# 11.SampleCalls The calls for each sample in the mpileup, space-delimited
# Each sample has six values separated by colons:
# Cons - consensus genotype in IUPAC format
# Cov - total depth of coverage
# Reads1 - number of reads supporting reference
# Reads2 - number of reads supporting variant
# Freq - the variant allele frequency by read count
##11.5)collect as
# P-value - FET p-value of observed reads vs expected non-variant
#fisher's exact test?