-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_cord19.py
185 lines (135 loc) · 6.59 KB
/
test_cord19.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
################################################################################
# #
# CS 7740/8740 #
# Fall 2020 - Spring 2021 #
# #
# Class Project - Test CORD-19 Corpus Reader #
# test_cord19.py #
# #
# Started: Jason James #
# 2020-9-15 #
# #
# Modified: Alex Morehead #
# 2021-4-11 #
# #
################################################################################
from cord19 import CORD19CorpusReader
# Directory of the CORD-19 corpus, relative to the current working directory.
# The path climbs three levels before entering Data/Archive/; adjust it to
# match where the corpus lives relative to where this script is run.
root = '../../../Data/Archive/'

# Make a corpus reader on everything that is a .json file in the directory.
# The pattern is a raw string so the backslash in `\.` is kept literally
# without relying on Python's deprecated handling of unrecognized escapes.
reader = CORD19CorpusReader(root, r'.*\.json')

# Alternative reader configurations (uncomment exactly one to replace the
# default reader above).

# Prefer PMC parses over PDF parses when both are available.
# reader = CORD19CorpusReader(root, r'.*\.json', prefer_pdf_parses=False, prefer_pmc_parses=True)

# Both preferences set to True; should just take all the JSON files found.
# reader = CORD19CorpusReader(root, r'.*\.json', prefer_pdf_parses=True, prefer_pmc_parses=True)

# Both preferences set to False, which should not include either parse when
# there are duplicates (probably not something you would typically want).
# reader = CORD19CorpusReader(root, r'.*\.json', prefer_pdf_parses=False, prefer_pmc_parses=False)

# Read only titles.
# reader = CORD19CorpusReader(root, r'.*\.json', include_titles=True, include_abstracts=False, include_bodies=False)

# Read only abstracts.
# reader = CORD19CorpusReader(root, r'.*\.json', include_titles=False, include_abstracts=True, include_bodies=False)

# Read only bodies.
# reader = CORD19CorpusReader(root, r'.*\.json', include_titles=False, include_abstracts=False, include_bodies=True)

# A single file ID, used by the "One File ID" checks below.
just_one_document = 'document_parses/pmc_json/PMC7480786.xml.json'

# A small list of file IDs (two PDF parses and two PMC parses), used by the
# "List of File IDs" checks below.
several_documents = [
    'document_parses/pdf_json/0000028b5cc154f68b8a269f6578f21e31f62977.json',
    'document_parses/pdf_json/0001418189999fea7f7cbe3e82703d71c85a6fe5.json',
    'document_parses/pmc_json/PMC7480774.xml.json',
    'document_parses/pmc_json/PMC7480786.xml.json',
]
# Show every file ID the reader exposes, followed by how many there are.
# The ID list is fetched once and reused for both the listing and the count,
# so the reader is not asked to enumerate the corpus twice.
print('Test fileids()')
all_fileids = reader.fileids()
print(all_fileids)
print('len(reader.fileids()):', len(all_fileids))
print()
# Exercise raw() first on a single file ID, then on a list of file IDs.
# Each case prints its heading, the result, and a blank separator line.
for heading, target in (
    ('Test raw() with One File ID', just_one_document),
    ('Test raw() with List of File IDs', several_documents),
):
    print(heading)
    print(reader.raw(target))
    print()
# Exercise words() first on a single file ID, then on a list of file IDs.
# Each case prints its heading, the result, and a blank separator line.
for heading, target in (
    ('Test words() with One File ID', just_one_document),
    ('Test words() with List of File IDs', several_documents),
):
    print(heading)
    print(reader.words(target))
    print()

# To list the words one per line instead, iterate over the result:
# for word in reader.words(just_one_document):
#     print(word)
# Exercise sents() first on a single file ID, then on a list of file IDs.
# Each case prints its heading, the result, and a blank separator line.
for heading, target in (
    ('Test sents() with One File ID', just_one_document),
    ('Test sents() with List of File IDs', several_documents),
):
    print(heading)
    print(reader.sents(target))
    print()

# To show only the first few sentences, one per line:
# for sentence in reader.sents(just_one_document)[0:5]:
#     print(sentence)
# Exercise paras() on a single file ID.
print('Test paras() with One File ID')
print(reader.paras(just_one_document))
# To show only the first few paragraphs, one per line:
# for paragraph in reader.paras(just_one_document)[0:5]:
#     print(paragraph)
# Blank separator, matching every other section of this script, so the next
# heading does not run straight on from the paragraph output.
print()

# Disabled: paras() on a list of file IDs.
# print('Test paras() with List of File IDs')
# print(reader.paras(several_documents))
# print()

# Disabled: word count over the entire corpus.
# words = reader.words()
# print("Word Count:", len(words))
# Exercise metadata() on a single file ID, then on a list of file IDs,
# printing the full result each time.
for heading, target in (
    ('Test metadata() with One File ID', just_one_document),
    ('Test metadata() with List of File IDs', several_documents),
):
    print(heading)
    print(reader.metadata(target))
    print()

# Exercise metadata() with no file IDs: once with default arguments and once
# with fileids_only=False. Only the number of items is printed in each case.
# `metadata` is left holding the result of the last call.
for heading, options in (
    ('Test metadata() for Entire Corpus', {}),
    ('Test metadata() for All Metadata', {'fileids_only': False}),
):
    print(heading)
    metadata = reader.metadata(**options)
    item_count = len(metadata.items())
    print('len(metadata.items()):', item_count)
    print()

# Disabled: dump the whole metadata mapping, or look up a single entry.
# print(reader.metadata())
# print('cord_id(8s9y5io9):', all_metadata['document_parses/pdf_json/46c6bbfc98d4485c31d8bdc82cbc8eaee4d31eb0.json'])
# Run statistics(); its return value is not used here, so any report shown
# comes from the method itself.
print('Test statistics()')
reader.statistics()
print()

# Exercise citations() first on a single file ID, then on a list of file
# IDs. `citations_list` is left holding the result of the last call.
for heading, target in (
    ('Test citations() with One File ID', just_one_document),
    ('Test citations() with List of File IDs', several_documents),
):
    print(heading)
    citations_list = reader.citations(target)
    print(citations_list)
    print()
# print('Test citations() with All File IDs')
# citations_list = reader.citations()
# print()
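# NOTE: the disabled demos below use `nltk`, which this script does not import; add `import nltk` at the top of the file before uncommenting them.
# print('Demo Frequency Distribution of Publish Times')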
# metadata = reader.metadata()
# publish_times = []
# for (key, entry_list) in metadata.items():
# for entry in entry_list:
# publish_times.append(entry['publish_time'])
# # print(publish_times)
# frequency_distribution = nltk.FreqDist(publish_times)
# frequency_distribution.plot(50)
# print()
# print('Demo Frequency Distribution of Publish Times (fileids_only = False)')
# metadata = reader.metadata(fileids_only = False)
# publish_times = []
# for (key, entry_list) in metadata.items():
# for entry in entry_list:
# publish_times.append(entry['publish_time'])
# # print(publish_times)
# frequency_distribution = nltk.FreqDist(publish_times)
# frequency_distribution.plot(50)
# print('Demo Frequency Distribution of Most Common Words')
# stopword_list = nltk.corpus.stopwords.words('english')
# ignore_list = ['.', ',', '!', '?', '[', ']', '(', ')', '`', '"', '\'', ';', ':', '%', '-', '+', '=', '_', '),', ').', '],', '/', '\\', '.,', 'et', 'al']
# ignore_list = ignore_list + stopword_list
# document_list = reader.fileids()[0:1000]
# word_list = [word.lower() for word in reader.words(document_list) if word.lower() not in ignore_list]
# frequency_distribution = nltk.FreqDist(word_list)
# frequency_distribution.plot(50)
# print()
# vocabulary = set(words)
# print("Vocabulary Count:", len(vocabulary))