# make_datasets+datasamples_fromGEO.py
# The goal of this script is to generate the three tables of metadata for our expression data directly from GEO...(without use of Janos's script)
# -File
# -Dataset
# -DataSample
#
#
# To run this script, you will need Biopython on your local machine (or wherever you're running this script from)
# For downloading the package, see http://biopython.org/wiki/Download
# For installation instructions, see http://biopython.org/DIST/docs/install/Installation.html
#
# INPUT: two dates--the range of PDATs you want GEO to look in for yeast datasets
# OUTPUT: two tsv files--the DATASET table and the DATASAMPLE table
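#
# Example invocation (hypothetical dates, e-mail, and output path--see the usage string in main() below):
#   python make_datasets+datasamples_fromGEO.py -s 2017-01 -l 2017-02 -e curator@example.org -o ./out
# which would write ./out/dataset2017-01to2017-02.tsv and ./out/datasample2017-01to2017-02.tsv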
from Bio import Entrez
import re
import sys
import getopt
import urllib2
import time
from dateutil.parser import parse
# pass in e-mail as argument
#Entrez.email = raw_input('Please enter your e-mail address:') # sys.argv[1] #
MAX_SEARCH = 5000  # retmax for Entrez esearch; a warning is printed if the result count hits this cap
MAX_SUMMARIES = 10  # number of summaries to request per esummary call (see retrieve_summaries)
WAIT_TIME = 4  # seconds to sleep between/after esummary calls to avoid HTTP 503 errors
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# VARIABLES
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# =======
# STRAINS
# =======
strains = {
'S288C': 'S288C', 'S288c': 'S288C', 's288c': 'S288C',
'BY4741': 'S288C', 'BY4742': 'S288C', 'BY4743': 'S288C',
'BY4744': 'S288C', 'BY4745': 'S288C', 's288C': 'S288C',
'DBY12020': 'S288C', 'DBY12021': 'S288C',
'CEN.PK': 'CEN.PK', 'CEN.PK2': 'CEN.PK',
'D273-10B': 'D273-10B', 'FL100': 'FL100', 'JK9-3d': 'JK9-3d',
'RM11-1a': 'RM11-1a', 'SEY6210': 'SEY6210', 'Sigma1278b': 'Sigma1278b',
'SK1': 'SK1',
'W303': 'W303', 'W303-1a': 'W303', 'W303-1A': 'W303',
'W303-K6001': 'W303', 'W303-1b': 'W303', 'W303-1B': 'W303',
'X2180-1A': 'X2180-1A', 'Y55': 'Y55'}
# ================
# OBI TERMS--ASSAY
# ================
geo2obi_assay = {
'Other': 'OBI:0000070',
'Genome binding/occupancy profiling by array': 'OBI:0001954',
'Genome binding/occupancy profiling by genome tiling array': 'OBI:0001419',
'Genome binding/occupancy profiling by high throughput sequencing': 'OBI:0000716',
'Genome variation profiling by array': 'OBI:0001393',
'Genome variation profiling by genome tiling array': 'OBI:0002030',
# 'Genome variation profiling by genome tiling array':'OBI:0001274',
'Genome variation profiling by SNP array': 'OBI:0002031',
'Expression profiling by array': 'OBI:0001463',
'Expression profiling by genome tiling array': 'OBI:0001235',
'Expression profiling by high throughput sequencing': 'OBI:0001271',
'Methylation profiling by high throughput sequencing': 'OBI:0001266',
'Non-coding RNA profiling by high throughput sequencing': 'OBI:0001271',
'Third-party reanalysis': 'OBI:0200000'}
obi2name_assay = {
'NTR(GSE14217)': 'synthetic lethality analysis with microarrays',
'NTR(GSE1753)': 'synthetic lethality analysis with microarrays',
'NTR(GSE1838)': 'synthetic lethality analysis with microarrays',
'NTR(GSE20961)': 'competitive growth assay analysis with microarrays',
'NTR(GSE46145)': 'competitive growth assay analysis with microarrays',
'OBI:0000070': 'assay',
'OBI:0000438': 'DNA sequence variation detection',
'OBI:0000626': 'DNA sequencing',
'OBI:0000716': 'ChIP-seq assay',
'OBI:0001235': 'transcription profiling by tiling array assay',
'OBI:0001247': 'genotyping by high throughput sequencing assay',
'OBI:0001248': 'ChIP-chip assay',
'OBI:0001266': 'DNA methylation profiling by high throughput sequencing assay',
'OBI:0001271': 'RNA-seq assay',
'OBI:0001274': 'genotyping by array assay',
'OBI:0001318': 'proteomic profiling by array assay',
'OBI:0001332': 'DNA methylation profiling by array assay',
'OBI:0001393': 'comparative genomic hybridization by array assay',
'OBI:0001419': 'ChIP-chip by tiling array assay',
'OBI:0001463': 'transcription profiling by array assay',
'OBI:0001858': 'cross-linking immunoprecipitation high-throughput sequencing assay',
'OBI:0001924': 'micrococcal nuclease digestion followed by high throughput sequencing assay',
'OBI:0001954': 'ChIP assay',
'OBI:0002030': 'genotyping by tiling array',
'OBI:0002031': 'genotyping by SNP array',
'OBI:0002036': 'array based nucleic acid structure mapping assay',
'OBI:0200000': 'data transformation'}
# ====================
# OBI TERMS--BIOSAMPLE
# ====================
geo2obi_biosample = {
'nuclear RNA': 'OBI:0000862',
'nuclear RNA extract': 'OBI:0000862',
'polyA RNA': 'OBI:0000869',
'polyA RNA extract': 'OBI:0000869',
'cytoplasmic RNA': 'OBI:0000876',
'cytoplasmic RNA extract': 'OBI:0000876',
'protein extract': 'OBI:0000894',
'total RNA': 'OBI:0000895',
'total RNA extract': 'OBI:0000895',
'genomic DNA': 'OBI:0001051',
'DNA extract': 'OBI:0001051',
'other': 'OBI:0000423'} # this is handling all the "else" cases
obi2name_biosample = {
'OBI:0000862': 'nuclear RNA extract',
'OBI:0000869': 'polyA RNA extract',
'OBI:0000876': 'cytoplasmic RNA extract',
'OBI:0000894': 'protein extract',
'OBI:0000895': 'total RNA extract',
'OBI:0001051': 'DNA extract',
'OBI:0000423': 'extract'}
# ======
# TAXONS
# ======
yeasts = {
'Saccharomyces cerevisiae': '4932', 'Saccharomyces bayanus': '4931',
'Saccharomyces kluyveryii': '4934', 'Saccharomyces kudriavzevii': '114524',
'Saccharomyces mikatae': '114525', 'Saccharomyces paradoxus': '27291',
'Saccharomyces uvarum': '230603', 'Saccharomyces castellii': '27288',
'Candida albicans': '5476', '[Candida] glabrata': '5478',
'Debaryomyces hansenii': '4959', 'Kluyveromyces lactis': '28985',
'Kluyveromyces waltii': '4914', 'Lachancea kluyveri': '4934',
'Lachancea waltii': '4914', 'Naumovozyma castellii': '27288',
'Schizosaccharomyces japonicus': '4897', 'Schizosaccharomyces octosporus': '4899',
'Schizosaccharomyces pombe': '4896', 'Yarrowia lipolytica': '4952'}
# strains of Saccharomyces cerevisiae all map to 4932
sc_strains = {
'Saccharomyces cerevisiae S288c': '4932', 'Saccharomyces cerevisiae BY4741': '4932',
'Saccharomyces cerevisiae S288C': '4932', 'Saccharomyces cerevisiae SK1': '4932',
'Saccharomyces cerevisiae Sigma1278b': '4932', 'Saccharomyces cerevisiae W303': '4932'}
# yeast crosses--we can't just search for "Saccharomyces cerevisiae" since it'll include some of these guys too...
yeast_crosses = {
'Saccharomyces cerevisiae x Saccharomyces kudriavzevii': '332112',
'Saccharomyces cerevisiae x Saccharomyces paradoxus': '595493',
'Saccharomyces cerevisiae x Saccharomyces uvarum': '489137',
'Saccharomyces paradoxus x Saccharomyces uvarum': '1911234'}
# non-yeast orgs
various_orgs = {
'Homo sapiens': '9606', 'Mus musculus': '10090',
'Caenorhabditis elegans': '6239', 'Drosophila melanogaster': '7227',
'Neurospora crassa': '5141', 'Escherichia coli': '562',
'Pseudomonas putida': '303', 'Synechococcus sp. WH 8102': '84588'}
# viruses--we can't just search for "Saccharomyces cerevisiae" since it'll include some of these guys too...
viruses = {
'Lachancea kluyveri NRRL Y-12651': '4934',
'Saccharomyces cerevisiae killer virus M1': '12450',
'Saccharomyces 20S RNA narnavirus': '186772',
'Saccharomyces 23S RNA narnavirus': '198599',
'Saccharomyces cerevisiae virus L-A': '11008',
'Saccharomyces cerevisiae virus L-BC (La)': '42478'}
# strains of other orgs and other weird cases
iffy_ones = {
'Neurospora crassa OR74A': '5141',
'synthetic construct': '32630',
'Synthetic plasmid': '1440148',
'Cryptococcus neoformans var. grubii': '178876'}
taxons2id = {}
taxons2id.update(yeasts)
taxons2id.update(sc_strains)
taxons2id.update(yeast_crosses)
taxons2id.update(various_orgs)
taxons2id.update(viruses)
taxons2id.update(iffy_ones)
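# A quick sanity check of the merged lookup (values taken from the dictionaries above):
#   taxons2id['Saccharomyces cerevisiae'] -> '4932'
#   taxons2id['Schizosaccharomyces pombe'] -> '4896'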
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# METHODS
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# generate_dataset_datasample_tables()
# build_series()
# build_samples()
# generate Dataset and DataSample tables directly from GEO...(without use of Janos's script)
# @param date_start Search GEO records from after this month. Format YYYY/MM.
# @param date_end Search GEO records from before this month. Format YYYY/MM.
# @param ext_str Suffix used in the output filenames (e.g. the date range).
# @param path Directory to write the output tsv files to.
def generate_dataset_datasample_tables(date_start, date_end, ext_str, path):
gse2dataset_lns = {}
gsm2datasample_lns = {}
# Build Search with the following criteria:
# -TYPE is gse
# -ORGANISM is Saccharomyces cerevisiae
# -PUBLIC DATE is between 'date_start' and 'date_end'
gse_term = 'Saccharomyces cerevisiae[ORGN]&%s:%s[PDAT]&gse[ETYP]' % (
date_start, date_end)
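    # e.g. for date_start='2017/01' and date_end='2017/02', the term resolves to:
    #   'Saccharomyces cerevisiae[ORGN]&2017/01:2017/02[PDAT]&gse[ETYP]'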
gse_idlist = search_gds(gse_term)
print gse_idlist
# Retrieve Summaries for GSE search results ids
gse_summaries = retrieve_summaries(gse_idlist)
# For each GSE series object....
for gse_summary in gse_summaries:
print '<><><><><>%s<><><><><>' % ('GSE'+gse_summary['GSE'])
print u'(%s)' % gse_summary[u'Id']
# for k,v in gse_summary.items():
# print (k,v)
# Build Search to get GSM-typed objects associated with current GSE accn
gsm_term = '(%s[ACCN]&gsm[ETYP])' % ('GSE'+gse_summary['GSE'])
gsm_idlist = search_gds(gsm_term)
    # CHECK that the number of samples matches the metadata
gse_accn = 'GSE%s' % gse_summary['GSE']
if len(gsm_idlist) != gse_summary['n_samples']:
print 'CURATOR ACTION: check samples--number of samples found with Entrez search does not match GEO-recorded number of samples for series (%s)' % gse_accn
print '\tEntrezSearch=%d, GEOrecord=%d' % (
len(gsm_idlist), gse_summary['n_samples'])
# Retrieve Summaries for GSM search results ids
gsm_summaries = retrieve_summaries(gsm_idlist)
# Add GSM entries to the DataSample Table
gsm_lns, gse_channel_ct = build_samples(gsm_summaries, gse_summary)
# Add the GSE string for the DataSet Table
gse_dict = build_series(gse_summary, gse_channel_ct)
gse2dataset_lns.update(gse_dict)
# Don't add to sample table if this series is a superseries
if is_superseries(gse_summary) == 1:
continue
gsm2datasample_lns.update({gse_accn: gsm_lns})
# Add parent series to dataset_lns
gse2dataset_lns = add_parent_series(gse2dataset_lns)
    # Dict of lines to list, sorted by accession
    dataset_lns = []
    datasample_lns = []
    sorted_gse = sorted(gse2dataset_lns.keys())
    sorted_gsm = sorted(gsm2datasample_lns.keys())
    for gse in sorted_gse:
        dataset_lns.append(gse2dataset_lns[gse])
    for gsm in sorted_gsm:
        datasample_lns.append(gsm2datasample_lns[gsm])
    # (sorting the raw .values() directly caused unicode-formatting issues,
    # hence the accession-keyed loop above)
    # dataset_lns = gse2dataset_lns.values()
    # datasample_lns = gsm2datasample_lns.values()
    # dataset_lns.sort()
    # datasample_lns.sort()
# Write lines to file
write_lines(path+'/dataset'+ext_str.replace("/", "-")+'.tsv', dataset_lns)
write_lines(path+'/datasample'+ext_str.replace("/", "-") +
'.tsv', datasample_lns)
#print_tables(dataset_lns, datasample_lns)
# ======
# SERIES
# ======
# Collect GSEs and GSMs separately...
# -add GSEs to Dataset lists
# -check GSM list for any samples related to GSEs not in first list
# -print the missed GSEs and add to gse_list...
# This bundles the data collection for the Dataset table--given a GSE summary, return the formatted string for adding to the Dataset table
# @param gse_summary dictionary Entrez summary for a given GSE object
# @param channel_ct string channel count ('1', '2', or 'CURATOR REVIEW') as determined by build_samples()
# @return: gse_ln string Formatted row for Dataset Table
# called in the generate_dataset_datasample_tables() method
def build_series(gse_summary, channel_ct):
global taxons2id
gse_ln = ''
# used for pmids, lab_name, and lab_location
pmid_lst = [str(i) for i in gse_summary['PubMedIds']]
# DataSample Table Info
gse_accn = 'GSE%s' % gse_summary['GSE']
title = gse_summary['title'] # unicode line
source = 'GEO'
ref_type = 'GEO'
date_public = parse(gse_summary['PDAT']).strftime('%Y-%m-%d')
parent_dataset = ''
if is_superseries(gse_summary) == 1:
parent_dataset = '*'
# Use helper method to find appropriate obi term
assay_name, assay_id = get_assay(gse_summary)
# Use taxon dictionary to translate taxon names to ids
t_ids = []
gse_taxons = gse_summary['taxon'].split(';')
    for t in gse_taxons:
        try:
            t_id = taxons2id[t.strip()]
            # avoids listing same taxon twice (e.g. "Saccharomyces cerevisiae;Saccharomyces cerevisiae SK1")
            if t_id not in t_ids:
                t_ids.append(t_id)
        except KeyError:
            print 'MISSING TAXON: %s' % t.strip()
            print 'CURATOR ACTION: update the taxon dictionary in this script to include the above missing taxon'
            t_ids.append('*')
taxon_id = '|'.join(t_ids)
sample_ct = str(gse_summary['n_samples'])
is_in_spell = '0'
is_in_browser = '0'
series_summary = gse_summary['summary'] # unicode line
# call helper method that retrieves from db=pubmed
lab_name, lab_location = get_lab_info(pmid_lst, gse_accn)
# Leave blank--manually filled in by curators
keywords = ''
pmids = '|'.join(pmid_lst)
# Leave blank--manually filled in by curators
display_name = ''
# 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s' % gse_accn
obj_url = gse_summary['FTPLink']
url_type = 'GEO'
gse_ln = u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
gse_accn, title, source, gse_accn, ref_type, date_public, parent_dataset, assay_name, assay_id, taxon_id, channel_ct, sample_ct, is_in_spell, is_in_browser, series_summary, lab_name, lab_location, keywords, pmids, display_name, obj_url, url_type)
print gse_ln
return {gse_accn: gse_ln}
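# For reference, the 22 Dataset table columns emitted by the format string above, in order:
#   dataset accn, title, source, reference accn, ref_type, date_public, parent_dataset,
#   assay_name, assay_id, taxon_id, channel_ct, sample_ct, is_in_spell, is_in_browser,
#   series_summary, lab_name, lab_location, keywords, pmids, display_name, obj_url, url_type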
# =======
# SAMPLES
# =======
# This bundles the data collection for the DataSamples table into strings formatted according to specifications
# @param: gsm_summaries listofdictionaries List of Entrez summaries for the samples of a given GSE object
# @param: gse_summary dictionary Entrez summaries for the given GSE object
# @return: gsm_lines string Formatted rows for DataSample Table associated with a given GSE object
# @return: channel_ct string '1', '2', or 'CURATOR REVIEW' based on the channel counts of the samples
# called in the generate_dataset_datasample_tables() method
def build_samples(gsm_summaries, gse_summary):
    gsm2lns = {}
    gsm_lines = ''
    gse_channels = []
# Two ways of assigning sample_order:
# A) by the order gse_summary returns them
# B) by ascending GSM-values
# Edith has chosen the B) option...
# first, sort samples into ascending order
sample_list = []
for sample in gse_summary['Samples']:
sample_list.append(sample['Accession'][3:])
sample_list.sort()
gse_accn = 'GSE%s' % gse_summary['GSE']
for gsm in gsm_summaries:
#print gsm
# for km,vm in gsm.items():
# print (km,vm)
gsm_display_name = gsm['title']
gsm_description = gsm['summary']
gsm_accn = gsm['Accession']
gsm_ref_type = 'GEO'
# Use helper method to find appropriate obi term
gsm_biosample_name, gsm_biosample_id = get_biosample(gsm_accn)
# STRAIN_INFO
gsm_strain = get_strain(
gsm_accn, gsm_description, gse_summary['taxon'])
#print 'Strain:%s' % gsm_strain
# SAMPLE_ORDER--use placeholder for now: '****' will be replaced after loop finishes
#gsm_sample_order = str( sample_list.index(gsm_accn[3:]) + 1 )
gsm_sample_order = '****'
gsm_ln = u'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
gse_accn, gsm_display_name, gsm_description, gsm_accn, gsm_ref_type, gsm_biosample_name, gsm_biosample_id, gsm_strain, gsm_sample_order)
gsm2lns.update({gsm_accn: gsm_ln})
#print '<><><><><><><><><><><><><><><>'
if 'channel 2' in gsm_description:
gse_channels.append(gsm_description)
gsm_accessions = gsm2lns.keys()
gsm_accessions.sort()
order_count = 1
for gsm in gsm_accessions:
gsm_lines = gsm_lines + \
gsm2lns[gsm].replace('\t****\n', '\t%d\n' % order_count)
order_count += 1
print gsm_lines
# return 1 or 2 based on channel count
if len(gse_channels) == len(gsm_summaries):
print 'all have channel=2'
return gsm_lines, '2'
elif len(gse_channels) == 0:
print 'all have channel=1'
return gsm_lines, '1'
# if conflicting 1 and 2 within a series, we will check if it's a superseries (just set as 2), otherwise tag for curator review
if is_superseries(gse_summary) == 1:
return gsm_lines, '2'
print 'CURATOR REVIEW: this series may contain samples with differing numbers of channels (%s)' % gse_accn
print '%d of %d' % (len(gse_channels), len(gsm_summaries))
return gsm_lines, 'CURATOR REVIEW'
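# For reference, the 9 DataSample table columns emitted by the format string above, in order:
#   dataset accn, display_name, description, sample accn, ref_type,
#   biosample_name, biosample_id, strain, sample_order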
# =======
# HELPERS
# =======
# write_lines()
# print_tables()
# Helper method for writing lines to a file. Encodes each line as UTF-8 so we can use the special characters in our database.
# @param fileout string filename to write strings to
# @param wlines listofStrings strings to write to the file
# @return void
# called by generate_dataset_datasample_tables
def write_lines(fileout, wlines):
    writer = open(fileout, 'w')
    for line in wlines:
        print line
        writer.write(line.encode('utf8'))
    writer.close()
# Debugging/display purposes
# Print results so I don't have to open the file to see obvious errors...
def print_tables(dataset_lns, datasample_lns):
print '<><><><><><><><><><> DATASET TABLE <><><><><><><><><><>'
for l in dataset_lns:
print l
print '<><><><><><><><><> DATASAMPLE TABLE <><><><><><><><><><>'
for l in datasample_lns:
print l
# ==============
# ENTREZ HELPERS
# ==============
# search_gds()
# retrieve_summaries()
# Retrieve the uids for the search term input for use with the retrieve_summaries method (see below).
# @param: search_term string Entrez search string used
# @return: idlist list Uids from results under the 'IdList' field
def search_gds(search_term):
search_handle = Entrez.esearch(
db='gds', retmax=MAX_SEARCH, term=search_term)
idlist = Entrez.read(search_handle)['IdList']
print '# results found:%d' % len(idlist)
    # Print a warning if the number of results hits the default cap.
    if len(idlist) == MAX_SEARCH:
        print 'WARNING: search result cap reached--there may be more results not captured by this script.\n\tTry running with a smaller date range.'
return idlist
# Formats idlist and runs eutils to retrieve object summaries, only the designated maximum at a time (MAX_SUMMARIES).
# This is the slowest part of the script: the frequent programmed delays make it run slowly, but they prevent the
# API from throwing "HTTP Error 503: Service Temporarily Unavailable" errors.
# @param: idlist list Uids from search results
# @return: summaries list Summary objects returned by Entrez
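# e.g. with MAX_SUMMARIES=10 and 25 ids, the batches requested are idlist[15:25], idlist[5:15], idlist[0:5] (last-to-first).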
def retrieve_summaries(idlist):
summaries = []
idlist_str = [str(i) for i in idlist]
idlist_len = len(idlist_str)
iterations = len(idlist)/MAX_SUMMARIES + 1
for i in range(iterations):
end = idlist_len-(i*MAX_SUMMARIES)
beg = idlist_len-(i*MAX_SUMMARIES)-MAX_SUMMARIES
if beg < 0:
beg = 0
idstring = ','.join(idlist_str[beg:end])
if idstring == '':
continue
try:
summary_handle = Entrez.esummary(db='gds', id=idstring)
summaries.extend(Entrez.read(summary_handle))
except urllib2.HTTPError:
print "Error fetching", idlist
# we have angered the API! Try waiting longer?
time.sleep(WAIT_TIME)
try:
summary_handle = Entrez.esummary(db='gds', id=idstring)
summaries.extend(Entrez.read(summary_handle))
except:
print "Error fetching(second try)", idlist
# we have angered the API! Try waiting longer?
time.sleep(WAIT_TIME)
try:
summary_handle = Entrez.esummary(db='gds', id=idstring)
summaries.extend(Entrez.read(summary_handle))
except:
print "Error fetching(third try)", idlist
# wait and try once more...
time.sleep(WAIT_TIME)
summary_handle = Entrez.esummary(db='gds', id=idstring)
summaries.extend(Entrez.read(summary_handle))
time.sleep(WAIT_TIME)
return summaries
# @param geo_accn string GSE or GSM accession
# @return string HTML text from the GEO page
def get_html(geo_accn):
opened_url = urllib2.urlopen(
'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=' + geo_accn)
html_txt = opened_url.read()
return html_txt
# =============
# "GET" HELPERS
# =============
# get_lab_info()
# get_assay()
# get_biosample()
# get_strain()
# is_superseries()
# Retrieve the information fields that can be found in PubMed.
# @param pmid_lst list All pmids associated with a GSE summary entry cast as string-types
# @return lab_name str LastAuthor of PubMed citation's AuthorList. Field in Dataset Table.
# @return lab_location str Affiliation of the LastAuthor object in the PubMed citation. Field in Dataset Table.
# Important notes:
# -IGNORING MULTIPLE PMIDS FOR NOW...for GSE with multiple PMIDs, we are just taking the lab location and lab name info from one of them.
# -for lab name, I am assuming last name in author list of efetch results is the same as last author results for esummary
# -Alternative for those without pmids: scrape html for 'Contributors' section...see https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE76117
# -Taking first affiliation listed for now...
# -Formatting of lab_location is still a little raw and may require some further adjustment/trimming...
def get_lab_info(pmid_lst, gse_accn):
lab_name = u''
lab_location = u''
# if no publication associated, can't fill in lab_location and lab_name
if len(pmid_lst) == 0:
# return u'',u''
html_txt = get_html(gse_accn)
# LAB_LOCATION
lab_location = re.findall(
'<tr bgcolor="#eeeeee" valign="top"><td nowrap>Organization name</td>\n<td style="text-align: justify">(.+)<br></td>', html_txt)[0].decode('utf-8')
else:
if len(pmid_lst) > 1:
print 'CURATOR NOTE: multiple PMIDs for this series so this script is arbitrarily using the first one (PMIDs=%s)' % ','.join(
pmid_lst)
# grab first pmid
pmid = pmid_lst[0]
# get record from PubMed
summary_handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
record = Entrez.read(summary_handle)['PubmedArticle'][0]
# LAB_NAME
# build lab_name field using LastName and Initials of the last AuthorList entry. Formatted to be consistent with previously collected metadata
last_name = record['MedlineCitation']['Article']['AuthorList'][-1]['LastName']
# print last_name
# print type(last_name)
initials = record['MedlineCitation']['Article']['AuthorList'][-1]['Initials']
# print initials
# print type(initials)
lab_name = u'%s %s' % (last_name, initials)
# print lab_name
# print type(lab_name)
        # TODO: may need to cap string size for lab affiliation.
        # LAB_LOCATION
        # If there are no affiliations, leave lab_location as the empty string.
        # If there is at least one affiliation, use the first affiliation listed.
        # If there is more than one, also print a warning about the multiple affiliations so a curator can fix it if needed.
        lab_location = u''
        n_affiliations = len(
            record['MedlineCitation']['Article']['AuthorList'][-1]['AffiliationInfo'])
        if n_affiliations >= 1:
            lab_location = record['MedlineCitation']['Article']['AuthorList'][-1]['AffiliationInfo'][0]['Affiliation']
        if n_affiliations > 1:
            print 'CURATOR NOTE: multiple affiliations for this author so this script is arbitrarily using the first one\n\t(PMID=%s)' % pmid
# print lab_name
# print type(lab_name)
#
# print lab_location
# print type(lab_location)
#
return lab_name, lab_location
# GEO to OBI translation for assay information in two columns of the Dataset table
# @param gse_summary dictionary the GSE summary
# @return tuple (obi_names, obi_ids) where
# obi_names is a string of |-delimited assay term names and
# obi_ids is a string of |-delimited assay term ids
# called in the build_series() method
def get_assay(gse_summary):
geo_ln = gse_summary['gdsType']
global geo2obi_assay
global obi2name_assay
obi_ids = []
obi_names = []
# for each geo string that describes an assay (;-delimited)...
geo_assays = geo_ln.split(';')
    for geo_assay in geo_assays:
        geo_assay = geo_assay.strip()
        # if it is in the geo/obi assay dictionaries...
        if geo_assay in geo2obi_assay:
            # translate it to an obi term and add the respective term name and id to the obi_names and obi_ids lists
            obi_id = geo2obi_assay[geo_assay]
            obi_ids.append(obi_id)
            obi_names.append(obi2name_assay[obi_id])
        # if the geo assay string was not recognized, print a message so the curator can add it to the geo2obi_assay dictionary at the top of this script
        else:
            print 'CURATOR ACTION: Update GEO assay dictionary to include the following assay--%s' % geo_assay
            obi_ids.append('CURATOR REVIEW')
            obi_names.append('CURATOR REVIEW:'+geo_assay)
return '|'.join(obi_names), '|'.join(obi_ids)
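# Illustrative translation (values taken from the assay dictionaries above):
#   get_assay({'gdsType': 'Expression profiling by array; Expression profiling by high throughput sequencing'})
#   -> ('transcription profiling by array assay|RNA-seq assay', 'OBI:0001463|OBI:0001271')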
# GEO webpage html scraping for biosample information in two columns of the Datasample files
# @param string gsm_accn is the GSM accession string
# @return tuple (obi_name, obi_id) should return a tuple of the biosample term name and id
# called in build_samples()
def get_biosample(gsm_accn):
global geo2obi_biosample
global obi2name_biosample
# retrieve html text from webpage
html_txt = get_html(gsm_accn)
# grab 50 characters following "Extracted molecule" which should be where the biosample has been formatted into the html
#html_chunk = html_txt.split('Extracted molecule</td>\n')[1][:50]
# Use RegEx to find extracted molecule chunks
extracted_molecule = ','.join(re.findall(
'<td nowrap>Extracted molecule</td>\n<td>(.+)</td>', html_txt))
# store all possible biosamples (strings that match any of the known geo2obi_biosample keys)
biosamples_column = []
    for mol in extracted_molecule.split(','):
        if mol in geo2obi_biosample:
            biosamples_column.append(geo2obi_biosample[mol])
# unique the items in biosamples_column
biosamples_column = list(set(biosamples_column))
# if one biosample is found, simply return tuple of obi_term name and obi_term id...
if len(biosamples_column) == 1:
return obi2name_biosample[biosamples_column[0]], biosamples_column[0]
# if no biosamples were found, curator may need to add/update the geo2obi_biosample and obi2name_biosample dictionaries at the top of this script
elif len(biosamples_column) == 0:
print '\nCURATOR ACTION: Biosample Dictionary in need of update <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
print extracted_molecule
return '', ''
# if multiple biosample hits, curator will have to go in and figure out what it is and what to do with it...(typo in record or maybe fix script to handle case?)
    else:  # len(biosamples_column) > 1
name = 'CURATOR REVIEW:%s <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<multiple "extracted molecules" found' % (
','.join(biosamples_column))
return name, 'CURATOR REVIEW'
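# Illustrative translation (assuming a GSM page whose "Extracted molecule" cell reads 'total RNA'):
#   get_biosample(<that GSM accession>) -> ('total RNA extract', 'OBI:0000895')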
# This helper method uses several approaches to try to determine the appropriate strain to assign to the datasample. These include (A) checking whether the taxon specifies a particular strain and (B) searching the description for known strain names (based on the strains dictionary at the top of the script).
# @param string gsm_accn
# @param string gsm_description
# @param string gse_taxon
# @return either (1) a string of the strain name, (2) a "CURATOR REVIEW" message with multiple possible strains (for curator to check manually), or (3) a "CURATOR REVIEW" message declaring none found (curator can double-check)
# called in the build_samples() method
def get_strain(gsm_accn, gsm_description, gse_taxon):
global strains
gsm_strains = []
# (A)-prep
taxons = gse_taxon.split(';')
# (B)-prep
description_tokens = gsm_description.replace('_', ' ').split()
for k in strains.keys():
# (A) use strain taxon info
if 'Saccharomyces cerevisiae ' + k in taxons and strains[k] not in gsm_strains:
gsm_strains.append(strains[k])
# (B) use gsm esummary
# this method for identifying strains may require further development...
if k in description_tokens and strains[k] not in gsm_strains:
gsm_strains.append(strains[k])
if len(gsm_strains) == 1:
return gsm_strains[0]
# if multiple found
    elif len(gsm_strains) > 1:
        review_str = 'CURATOR REVIEW: multiple strains found in description summary/gse taxons--check if strain is %s' % (
            ' or '.join(gsm_strains))
print review_str
return review_str
# if none found
return ''
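# Illustrative call (hypothetical description text; mapping taken from the strains dictionary above):
#   get_strain('GSM0000000', 'wild-type BY4741 grown in YPD', 'Saccharomyces cerevisiae')
#   -> 'S288C'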
# This helper method just returns a binary 0/1--1 if the GSE is a SuperSeries, 0 otherwise.
# called in generate_dataset_datasample_tables(), build_series(), and build_samples()
def is_superseries(gse_summary):
gse_accn = 'GSE%s' % gse_summary['GSE']
if 'SuperSeries' in gse_summary['summary']:
# print 'CURATOR ACTION: SuperSeries found so manual edits required %s' % (gse_accn)
return 1
return 0
# HTML scraping TODOs...
# (1) superseries parents
# (2) lab location and name for series w/o an associated pmid
# (3) improve strain search by adding "characteristics" section (format "strain: %s" % strain_name)
#
def add_parent_series_via_superseries(dataset_lns):
parent2sub = {}
for gse_accn in dataset_lns.keys():
# retrieve html text from webpage
html_txt = get_html(gse_accn)
# Grab chunk of text related to Superseries relationships
start = html_txt.find(
'<tr valign="top"><td colspan="2">This SuperSeries is composed of the following SubSeries:</td></tr>')
end = html_txt.find(
'<tr valign="top"><td colspan="2"><strong>Relations</strong></td></tr>')
if start == -1:
continue
        html_chunk = html_txt[start:end]
        print html_chunk
subseries = list(set(re.findall('GSE[0-9]{3,6}', html_chunk)))
parent2sub.update({gse_accn: subseries})
for parent in parent2sub.keys():
for subseries in parent2sub[parent]:
dataset_tokens = dataset_lns[subseries].split('\t')
dataset_tokens[6] = parent
dataset_lns[subseries] = '\t'.join(dataset_tokens)
return dataset_lns
# HTML scrape for SuperSeries information and input into the appropriate subseries column
# @param dict gse2dataset_lns
# @return dict the updated version with parent series added to the appropriate parent_dataset column
def add_parent_series(dataset_lns):
# Generate subseries to parents dictionary
sub2parent = {}
for gse_accn in dataset_lns.keys():
# retrieve html text from webpage
html_txt = get_html(gse_accn)
# Grab chunk of text related to SuperSeries/SubSeries relationships (and ignore if there isn't any)
html_chunk = ''.join(re.findall(
'<tr valign="top"><td colspan="2">This SubSeries is part of SuperSeries:</td></tr>(.+)<tr valign="top"><td colspan="2"><strong>Relations</strong></td></tr>', html_txt, re.DOTALL))
# grab any GSEs within this chunk and assign in sub2parent dictionary
parents = list(set(re.findall('GSE[0-9]{2,6}', html_chunk)))
# if series doesn't have a parent, skip
if len(parents) == 0:
continue
# update dictionary of subseries to add parents for
sub2parent.update({gse_accn: parents})
# Update the dataset_lns dictionary
for subseries in sub2parent.keys():
dataset_tokens = dataset_lns[subseries].split('\t')
dataset_tokens[6] = '|'.join(sub2parent[subseries])
dataset_lns[subseries] = '\t'.join(dataset_tokens)
return dataset_lns
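# Hypothetical example: if GSE76715's GEO page listed SuperSeries GSE76778, then the subseries row's
# parent_dataset column (token index 6) would become 'GSE76778'.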
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
def retrieve_display_names(pmids):
pmid2display_names = {}
idstring = ','.join(pmids)
summary_handle = Entrez.efetch(db='pubmed', id=idstring, retmode='xml')
records = Entrez.read(summary_handle)['PubmedArticle']
for record in records:
pmid = record['MedlineCitation']['PMID']
author = record['MedlineCitation']['Article']['AuthorList'][0]['LastName']
year = record['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year']
display_name = '%s_%s_PMID_%s' % (author, year, pmid)
pmid2display_names.update({pmid: display_name.decode('utf8')})
return pmid2display_names
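# Illustrative display name (hypothetical author/year/pmid, following the format string above):
#   'Smith_2016_PMID_12345678'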
# Based on this test, I determined that the maximum number of summaries I can retrieve at once using the idlist is 80.
# Thus, I edited this script so the helper method only retrieves that many at a time but returns a full list of summaries, so I don't have to adjust the whole script (just the helper).
# Update: the maximum number changes as you use it...I have switched to retrieving a small batch at a time (see the MAX_SUMMARIES constant).
def test_max():
term = '(GSE71490[ACCN]&gsm[ETYP])'
idlist = search_gds(term)
idlist_str = [str(i) for i in idlist]
cur_len = 60
for i in range(500):
print 'length: %d' % cur_len
idstring = ','.join(idlist_str[:cur_len])
print idstring
summary_handle = Entrez.esummary(db='gds', id=idstring)
summaries = Entrez.read(summary_handle)
print 'found summaries!'
cur_len += 1
return summaries
#esearch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds...'
#efetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gds...'
# def test_range(some_int):
#
# lst = []
# for i in range(some_int):
# lst.append( i )
# print 'length: %d\n' % len(lst)
# print lst
#
# summaries = []
# lst = [str(i) for i in lst]
#
# iterations = len(lst)/80 + 1
#
# lst_len = len(lst)
# for i in range(iterations):
# print 'i=%d' % i
#
# end = lst_len-(i*80)
# beg = lst_len-(i*80)-80
#
# if beg < 0:
# beg = 0
# id_lst = lst[beg:end]
#
# print id_lst
# print len(id_lst)
# summaries.extend(id_lst)
# print summaries
# print len(summaries)
# def test_get_lab_info():
# get_lab_info([],'GSE76716')
# get_lab_info([],'GSE86279')
# get_lab_info([],'GSE76715')
# get_lab_info(['26895426'])
# ...For dealing with case of empty AffiliationInfo list:
# get_lab_info(['25425491'])
# def test_add_parent_series():
# dataset_lns = {'GSE76778':'0\t1\t2\t3\t4\t5\t6\t7\t8', 'GSE76715':'0\t1\t2\t3\t4\t5\t6\t7\t8',
# 'GSE76716':'0\t1\t2\t3\t4\t5\t6\t7\t8', 'GSE93959':'0\t1\t2\t3\t4\t5\t6\t7\t8',
# 'GSE86279':'0\t1\t2\t3\t4\t5\t6\t7\t8', 'GSE84066':'0\t1\t2\t3\t4\t5\t6\t7\t8',
# 'GSE84065':'0\t1\t2\t3\t4\t5\t6\t7\t8'}
#
# dataset_lns = add_parent_series(dataset_lns)
# print dataset_lns
def main(argv):
try:
        if len(argv) > 2:
            # note the trailing colon on 'o:'--the outpath option takes an argument
            opts, args = getopt.getopt(
                argv, 's:l:e:o:', ['startdate=', 'lastdate=', 'email=', 'outpath='])
            print 'getting opts: ' + str(argv)
outpath = './'
for opt, arg in opts:
arg = re.sub('=', '', arg)
if opt in ("-s", "--startdate"):
start = arg
if opt in ("-l", "--lastdate"):
enddate = arg
if opt in ("-e", "--email"):
Entrez.email = arg
if opt in ("-o", "--outpath"):
outpath = arg
out_ext = start+"to"+enddate
#Entrez.email = #raw_input('Please enter you e-mail address:') # sys.argv[1] #
# print out_ext + "," + Entrez.email + "," + start + \
# "," + enddate + "," + start.replace("-", "/"
generate_dataset_datasample_tables(start.replace(
"-", "/"), enddate.replace("-", "/"), out_ext, outpath)
# generate_dataset_datasample_tables(
# start, enddate, out_ext, outpath)
else:
print 'Usage: make_datasets+datasamples_fromGEO.py -s <YYYY-MM> -l <YYYY-MM> -e <email address> -o <outfile path>'
sys.exit(2)
except getopt.GetoptError, err:
print str(err)
print 'Usage: make_datasets+datasamples_fromGEO.py -s <YYYY-MM> -l <YYYY-MM> -e <email address> -o <outfile path>'
sys.exit(2)
# t0 = time.time()
# generate_dataset_datasample_tables('2017/01','2017/01','2017-01')
# t1 = time.time()
# print 'TIME(Jan):'
# print t1-t0
# generate_dataset_datasample_tables('2017/02','2017/02','2017-02')
# t2 = time.time()
# print 'TIME(Feb):'
# print t2-t1
# generate_dataset_datasample_tables('2017/03','2017/03','2017-03')
# t3 = time.time()
# print 'TIME(Mar):'
# print t3-t2
# generate_dataset_datasample_tables('2017/04','2017/04','2017-04')
# t4 = time.time()
# print 'TIME(Apr):'
# print t4-t3
# generate_dataset_datasample_tables('2017/05','2017/05','2017-05')
# t5 = time.time()
# print 'TIME(May):'
# print t5-t4
# TIME(Jan): # (25)
# 568.598527908
# TIME(Feb): # (15)
# 378.935008049
# TIME(Mar): # (15)
# 306.390367031
# TIME(Apr): # (20)
# 732.77582407
# TIME(May): # (26)