-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathENCODE_fastq_trimmer.py
171 lines (144 loc) · 6.06 KB
/
ENCODE_fastq_trimmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import argparse
import os.path
#import encodedcc
import sys
import requests
from urllib.parse import urljoin
import gzip
import subprocess
import csv
import logging
logger = logging.getLogger(__name__)
EPILOG = '''
This script will trim fastqs down to a read length specified in the TSV file
with the '--object' option
TSV format:
The TSV requires the accession and read_length of the files you want edited
If the run_type needs to be changed include the run_type
Include paired_with on the second file in the pair
EX:
accession read_length run_type paired_with
ENCFF240OHP 25 paired-ended
ENCFF521WIC 25 paired-ended ENCFF240OHP
This is a dryrun-default script and needs to be run with '--update' for
changes to occur
please note that this script assumes that the files are valid and uploads
them without running validateFile unlike ENCODE_submit_files.py
If you want the files validated or you simply want to run them through
ENCODE_submit_files then run without the '--update' option to make new files
For more details:
%(prog)s --help
'''
def getArgs():
    """Build the command-line parser for the trimmer and return parsed args.

    Options: --object (input TSV), --key / --keyfile (keypair selection),
    --debug (verbose logging flag), --update (actually PATCH/POST data;
    without it the script is a dry run).
    """
    # Compute the default keyfile path once; it is reused in the help text.
    keyfile_default = os.path.expanduser("~/keypairs.json")
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--object',
                        help="TSV file of accessions, desired read length and \
other metadata needed for submitting")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile. \
Default is --key=default")
    parser.add_argument('--keyfile',
                        default=keyfile_default,
                        help="The keypair file. Default is --keyfile=%s" % keyfile_default)
    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help="Print debug messages. Default is False.")
    parser.add_argument('--update',
                        action='store_true',
                        default=False,
                        help="Let the script PATCH the data. Default is False")
    return parser.parse_args()
def trim_file(url, connection, filename, size):
    """Stream a gzipped FASTQ from *url* and write *filename* with every
    read's sequence and quality lines truncated to *size* characters.

    url        -- full download URL for the source fastq.gz
    connection -- object whose .auth attribute supplies the requests auth
    filename   -- path of the gzipped output file to create
    size       -- maximum read length to keep (int)
    """
    r = requests.get(url, auth=connection.auth, stream=True)
    # r.raw is the still-compressed byte stream; wrap it so we can read
    # decompressed FASTQ lines without saving the download to disk first.
    gzfile = gzip.GzipFile(fileobj=r.raw)
    with gzip.open(filename, "wb") as outfile:
        # BUG FIX: the original print format string had lost its
        # {filename} placeholder, so .format(filename=...) was a no-op.
        print("writing file {filename}".format(filename=filename))
        while True:
            # FASTQ records are 4 lines: @header, sequence, '+', quality.
            # readline() returns b'' at end of stream (unlike next(),
            # which raised StopIteration and could leak out mid-record).
            header = gzfile.readline()
            if not header:
                break
            sequence = gzfile.readline()[:-1][:size] + b'\n'
            qual_header = gzfile.readline()
            # snip off newline, trim to size, add back newline
            quality = gzfile.readline()[:-1][:size] + b'\n'
            outfile.write(header)
            outfile.write(sequence)
            outfile.write(qual_header)
            outfile.write(quality)
def main():
    """Drive the trim-and-resubmit workflow described in EPILOG.

    Reads the TSV named by --object, downloads each listed fastq, trims the
    reads to the requested length, rebuilds the file metadata, and (when
    --update is given) POSTs the new file object and uploads the trimmed
    fastq, removing the local copy afterwards.
    """
    args = getArgs()
    # NOTE(review): 'import encodedcc' is commented out at the top of this
    # file, so these calls raise NameError until it is restored — confirm
    # the intended import before running.
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    data = []
    if args.update:
        print("This is an UPDATE run. Data will be POSTed to server")
    else:
        print("This is a dry run. Nothing will be changed.")
    # BUG FIX: guard args.object being None (option omitted) before the
    # isfile() check, which would otherwise raise TypeError.
    if args.object and os.path.isfile(args.object):
        with open(args.object, "r") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t")
            for row in reader:
                data.append(row)
    else:
        print("No file provided!")
    if len(data) == 0:
        print("No data to check!", file=sys.stderr)
        sys.exit(1)
    for line in data:
        acc = line["accession"]
        size = int(line["read_length"])
        filename = acc + "_modified.fastq.gz"
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        url = urljoin(connection.server, link)
        trim_file(url, connection, filename, size)
        print("making metadata")
        file_data = encodedcc.get_ENCODE(acc, connection, frame="edit")
        derived_from = "/files/{}/".format(file_data["accession"])
        file_data["derived_from"] = [derived_from]
        # Server-managed / read-only properties that must not be submitted.
        unsubmittable = ['md5sum', 'quality_metrics', 'file_size',
                         'schema_version', 'accession', 'date_created',
                         'content_md5sum', 'status', 'submitted_by',
                         'alternate_accessions']
        for item in unsubmittable:
            # remove items we don't want to submit
            file_data.pop(item, None)
        # alter values of some items
        file_data["submitted_file_name"] = filename
        file_data["read_length"] = size
        file_data["aliases"] = [
            "j-michael-cherry:{acc}-{size}".format(acc=acc, size=size)]
        # conditional items, only change under particular circumstances
        if line.get("run_type"):
            file_data["run_type"] = line["run_type"]
            if line["run_type"] == "single-ended":
                file_data.pop("paired_end", None)
        if line.get("paired_with"):
            # file has partner, time for fancy stuff: point paired_with at
            # the alias the partner file was (or will be) posted under
            pair = "j-michael-cherry:{acc}-{size}".format(
                acc=line["paired_with"], size=size)
            file_data["paired_with"] = pair
        print(file_data)
        # upload file to ENCODE
        file_object = encodedcc.post_file(file_data, connection, args.update)
        if not file_object:
            # BUG FIX: '%d' with the string accession raised TypeError; use
            # %s and lazy logger args instead of eager % formatting.
            logger.warning('Skipping row %s: POST file object failed', acc)
            continue
        # post on aws
        aws_return_code = encodedcc.upload_file(file_object, args.update)
        if aws_return_code:
            # BUG FIX: the original had two placeholders but only one
            # argument, which raised TypeError when the warning fired.
            logger.warning('Row %s: Non-zero AWS upload return code %s',
                           acc, aws_return_code)
        if args.update:
            # remove file because space reasons
            # BUG FIX: the format strings here had lost their {filename}
            # placeholders; also use os.remove instead of spawning 'rm'.
            print("Removing file {filename}".format(filename=filename))
            os.remove(filename)


if __name__ == '__main__':
    main()