
Commit 9041fb9

chg: Move io functions to io module
1 parent cda8323 commit 9041fb9

5 files changed: +363 -603 lines changed


deep_reference_parser/io/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-from .io import read_jsonl, write_jsonl
+from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle,
+                 write_to_csv, write_tsv)
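
With this change the io subpackage re-exports all of the helpers below, so callers no longer need to reach into deep_reference_parser.io.io directly. A minimal sketch of the new import surface (assuming the package is installed):

from deep_reference_parser.io import (
    load_tsv, read_jsonl, read_pickle,
    write_jsonl, write_pickle, write_to_csv, write_tsv,
)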

deep_reference_parser/io/io.py

Lines changed: 133 additions & 0 deletions
@@ -6,9 +6,74 @@
 """
 
 import json
+import pickle
+import csv
+import os
+import pandas as pd
 
 from ..logger import logger
 
+
+def _split_list_by_linebreaks(tokens):
+    """Cycle through a list of tokens (or labels) and split them into lists
+    based on the presence of Nones (or, more likely, NaNs introduced when
+    pd.DataFrame columns are converted to lists).
+    """
+    out = []
+    tokens_gen = iter(tokens)
+    while True:
+        try:
+            token = next(tokens_gen)
+            if isinstance(token, str) and token:
+                out.append(token)
+            else:
+                yield out
+                out = []
+        except StopIteration:
+            if out:
+                yield out
+            break
+
+
+def load_tsv(filepath, split_char="\t"):
+    """
+    Load and return the data stored in the given path.
+
+    Expects data in the following format (tab separated):
+
+    References      o       o
+                    o       o
+    1               o       o
+    .               o       o
+                    o       o
+    WHO             title   b-r
+    treatment       title   i-r
+    guidelines      title   i-r
+    for             title   i-r
+    drug            title   i-r
+    -               title   i-r
+    resistant       title   i-r
+    tuberculosis    title   i-r
+    ,               title   i-r
+    2016            title   i-r
+
+    Args:
+        filepath (str): Path to the data.
+        split_char (str): Character used to split each line of the document.
+
+    Returns:
+        A tuple of lists, one per column in filepath: the tokens plus however
+        many label columns the file contains.
+    """
+
+    df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
+    out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]
+
+    logger.info("Loaded %s training examples", len(out[0]))
+
+    return tuple(out)
 
 def write_jsonl(input_data, output_file):
     """
@@ -61,3 +126,71 @@ def read_jsonl(input_file):
     logger.debug("Read %s lines from %s", len(out), input_file)
 
     return out
+
+
+def write_to_csv(filename, columns, rows):
+    """
+    Create a .csv file from data given as columns and rows.
+
+    Args:
+        filename (str): Path and name of the .csv file, without the csv extension.
+        columns (list): Columns of the csv file (first row of the file).
+        rows: Data to write into the csv file, given per row.
+    """
+
+    with open(filename, "w") as csvfile:
+        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
+        wr.writerow(columns)
+
+        for i, row in enumerate(rows):
+            wr.writerow(row)
+    logger.info("Wrote results to %s", filename)
+
+
+def write_pickle(input_data, output_file, path=None):
+    """
+    Write an object to a pickle file.
+
+    Args:
+        input_data (dict): A dict to be written to pickle.
+        output_file (str): A filename or path to which the pickle will be saved.
+        path (str): A string which will be prepended onto `output_file` with
+            `os.path.join()`. Obviates the need for lengthy `os.path.join`
+            statements each time this function is called.
+    """
+
+    if path:
+        output_file = os.path.join(path, output_file)
+
+    with open(output_file, "wb") as fb:
+        pickle.dump(input_data, fb)
+
+
+def read_pickle(input_file, path=None):
+    """Load an object from a pickle file.
+
+    Args:
+        input_file (str): File to be loaded.
+        path (str): A string which will be prepended onto `input_file` with
+            `os.path.join()`. Obviates the need for lengthy `os.path.join`
+            statements each time this function is called.
+    """
+
+    if path:
+        input_file = os.path.join(path, input_file)
+
+    with open(input_file, "rb") as fb:
+        out = pickle.load(fb)
+
+    logger.debug("Read data from %s", input_file)
+
+    return out
+
+
+def write_tsv(token_label_pairs, output_path):
+    """
+    Write token/label pairs to a tsv file on disk.
+    """
+    with open(output_path, "w") as fb:
+        writer = csv.writer(fb, delimiter="\t")
+        writer.writerows(token_label_pairs)
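
For reference, a short usage sketch of the tsv helpers added above. The file names, and the assumption that the file carries one token column and two label columns (mirroring the load_tsv docstring example), are illustrative only and not part of the commit:

from deep_reference_parser.io import load_tsv, write_tsv

# Blank lines in the tsv delimit training examples, so each column comes back
# as a list of per-example lists.
tokens, spans, labels = load_tsv("data/train.tsv")

# Re-serialise the first example as (token, label) rows, one row per line.
write_tsv(list(zip(tokens[0], labels[0])), "data/example_0.tsv")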

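Likewise, a small sketch of the write_pickle / read_pickle round trip using the optional path argument described in their docstrings; the directory and file name here are hypothetical:

import tempfile

from deep_reference_parser.io import read_pickle, write_pickle

tmp_dir = tempfile.mkdtemp()
artefact = {"weights": [0.1, 0.2, 0.3]}

# path is joined onto the file name with os.path.join, so both calls resolve
# to the same file on disk.
write_pickle(artefact, "artefact.pickle", path=tmp_dir)
assert read_pickle("artefact.pickle", path=tmp_dir) == artefact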