8
8
A script to generate a database of refseq annotations with a row for each
9
9
probe that overlaps the region.
10
10
11
+ Changed on August 29, 2019 to include strand information for the probe.
12
+
11
13
"""
12
14
13
15
import argparse
17
19
from Bio .Seq import Seq
18
20
from Bio .Alphabet import IUPAC
19
21
import pandas as pd
22
+ import numpy as np
20
23
21
24
def check_polarity(row):
    """Return the probe sequence, flipped when the reference strand is '+'.

    Reads the strand marker from ``row[15]`` and the probe sequence from
    ``row[3]``.  When the marker is ``'+'`` the sequence is wrapped in a
    Biopython ``Seq`` and its reverse complement is returned; for any other
    marker the sequence is returned unchanged.  The result is always a
    ``str``.
    """
    # Guard clause: anything other than '+' leaves the probe as it is.
    if row[15] != '+':
        return str(row[3])

    dna = Seq(row[3], IUPAC.unambiguous_dna)
    return str(dna.reverse_complement())
27
30
28
31
def truncate_refseq(row):
    """Reduce the RefSeq field in ``row[13]`` to a bare accession.

    Keeps the first two underscore-separated pieces of the field, rejoins
    them with ``_``, and then discards everything from the first ``.``
    onward (for example ``NM_001234.5_exon_0`` becomes ``NM_001234``).
    """
    pieces = row[13].split('_', 2)
    # Index rather than slice so that a field containing no underscore
    # still raises IndexError.
    versioned = pieces[0] + '_' + pieces[1]
    accession, _, _ = versioned.partition('.')
    return accession
33
36
@@ -44,8 +47,6 @@ def getArgs(strInput=None):
44
47
required = True , help = "The name of the annotation file" )
45
48
parser .add_argument ('-o' , '--outputFile' , action = 'store' , type = str ,
46
49
required = False , help = "The name for output file" )
47
- parser .add_argument ('-f' , '--file' , action = 'store_true' , default = False ,
48
- required = False , help = "Run in file mode" )
49
50
50
51
return parser .parse_args ()
51
52
@@ -65,22 +66,21 @@ def main():
65
66
else :
66
67
out_name = args .outputFile
67
68
68
- if args .file :
69
- probes = []
70
- with open (args .folder ) as probe_file :
69
+
70
+ # create a list of all files in directory
71
+ files = glob .glob (folder + "/*" )
72
+
73
+ # creates a list of all of the probes from each
74
+ # probe file, each represented as a single string
75
+ probes = []
76
+ for f in files :
77
+ with open (f ) as probe_file :
71
78
for line in probe_file :
72
79
probes .append (line .strip ())
73
- else :
74
- # create a list of all files in directory
75
- files = glob .glob (folder + "/*" )
76
-
77
- # creates a list of the all of the probes from each
78
- # probe file, each represented as a single string
79
- probes = []
80
- for f in files :
81
- with open (f ) as probe_file :
82
- for line in probe_file :
83
- probes .append (line .strip ())
80
+
81
+ # add strand column to probes
82
+ for i in range (0 , len (probes ), 1 ):
83
+ probes [i ] = probes [i ] + '\t +'
84
84
85
85
# creates a bedtool object with the entire probe set for the assembly
86
86
probe_bedtool = pybedtools .BedTool (probes )
@@ -111,11 +111,14 @@ def main():
111
111
# check polarity of annotations, function flips probe sequence if necessary
112
112
probes [3 ] = probes .apply (check_polarity , axis = 1 )
113
113
114
+ # flip probe strand too
115
+ probes [9 ] = np .where (probes [15 ] == '+' , '-' , '+' )
116
+
114
117
# convert refseq column to just accession
115
- probes [12 ] = probes .apply (truncate_refseq , axis = 1 )
118
+ probes [13 ] = probes .apply (truncate_refseq , axis = 1 )
116
119
117
120
# drop unnecessary columns
118
- probes .drop ([9 , 10 , 11 , 13 , 14 , 15 ],
121
+ probes .drop ([10 , 11 , 12 , 14 , 15 , 16 ],
119
122
axis = 1 ,
120
123
inplace = True )
121
124
0 commit comments