-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSNP_filter.py
76 lines (70 loc) · 2.65 KB
/
SNP_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import argparse
import os
import pysam
def args_parser():
'''parser the arguments from terminal command'''
parser=argparse.ArgumentParser(prog = "PROG", formatter_class = argparse.RawDescriptionHelpFormatter, description="filter bad genes for each genotype. \n\
Usage: SNP_filter.py -dir <read_count> -is_ref -O ")
parser.add_argument("-dir", "--directory", help = "the directory with allelic read count of given genotype. ")
parser.add_argument("-is_ref", "--is_ref", help = "assign the genotype as reference genome. ", action = "store_true", default = False)
parser.add_argument("-O", "--output", help = "output name ")
parser.add_argument("-cutoff", "--cutoff", default = 5, help = "the percentage cutoff value to filter snp sites. ", type = int)
args = parser.parse_args()
return args
def count_sum(args):
dir = args.directory
fs = [f for f in os.listdir(dir) if f.endswith(".txt")]
f_total = []
f_ref = []
f_alt = []
f_non = []
for f in fs:
snp_allele = pd.read_table(os.path.join(dir, f), sep = "\t", header = 0, index_col = ["chrom", "variants_pos"])
snp_total = snp_allele[["total_count"]]
snp_ref = snp_allele[["ref_count"]]
snp_alt = snp_allele[["alt_count"]]
snp_non = snp_allele[["non_count"]]
f_total.append(snp_total)
f_ref.append(snp_ref)
f_alt.append(snp_alt)
f_non.append(snp_non)
total_df = pd.concat(f_total, axis = 1)
ref_df = pd.concat(f_ref, axis = 1)
alt_df = pd.concat(f_alt, axis = 1)
non_df = pd.concat(f_non, axis = 1)
###
total_sum_col = total_df.sum(axis = 1)
ref_sum_col = ref_df.sum(axis = 1)
alt_sum_col = alt_df.sum(axis = 1)
non_sum_col = non_df.sum(axis = 1)
####
df = pd.DataFrame(columns = ["total", "ref", "alt", "non"])
df["total"] = total_sum_col
df["ref"] = ref_sum_col
df["alt"] = alt_sum_col
df["non"] = non_sum_col
return df
def bad_snp(args, df):
'''Get mis-ratio of filtered genes against total-count. '''
is_ref = args.is_ref
cutoff = args.cutoff
output = args.output
#
if is_ref == True:
df["mis_ratio"] = (df["alt"] + df["non"])/df["total"] * 100
else:
df["mis_ratio"] = (df["ref"] + df["non"])/df["total"] * 100
########### cutoff for potential problematic genes set here ##############
bad_snp = df[df.mis_ratio > cutoff]
bad_snp.to_csv(output+".txt", sep = "\t", index = True)
return bad_gid
def main():
args = args_parser()
df_sum = count_sum(args)
bad_snp(args, df_sum)
##############
### Run it ###
##############
if __name__ == "__main__":
main()