This repository was archived by the owner on Jul 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathavro_from_wikiquote.py
82 lines (67 loc) · 3.25 KB
/
avro_from_wikiquote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import bz2
import xml.sax
import mwparserfromhell
import argparse
import avro
from avro.datafile import DataFileWriter
from avro.io import DatumWriter
parser = argparse.ArgumentParser(description="extract quotes from a Wikiquote dump")
parser.add_argument("dump_path", type=str, help="the path of the Wikiquote dump, compressed")
parser.add_argument("-output_file", type=str, help="the avro serialized file", default="utterances.avro")
args = parser.parse_args()
bz_file = bz2.BZ2File(args.dump_path)
schema = avro.schema.Parse(open("corpus_utterance.avsc", "rb").read().decode('utf-8'))
writer = DataFileWriter(open(args.output_file, "wb"), DatumWriter(), schema)
class WikiDumpHandler(xml.sax.ContentHandler):
def __init__(self):
super().__init__()
self.page_count = 0
self.current_quotes = 0
self.current_title = ''
self.current_raw_page = ''
self.current_tag = None
def characters(self, content):
if self.current_tag == 'title':
self.current_title += content
if self.current_tag == 'text':
self.current_raw_page += content
def endElement(self, name):
if name == 'title':
self.current_tag = None
if name == 'text':
# TODO use mediawiki namespaces to identify special pages. This is only for EN and IT
if self.current_title.split(':') in ['Category', 'Categoria', 'Wikiquote']:
return
self.page_count += 1
print('Article {3}: {0} - {1} characters in {2} lines, {4} quotes so far'.format(self.current_title, len(self.current_raw_page), len(self.current_raw_page.split('\n')), self.page_count, self.current_quotes))
wikicode = mwparserfromhell.parse(self.current_raw_page)
# a quote is marked by a Tag with a * as the content, then the content is inside a list of Text nodes
in_quote = False
current_quote = ''
for n in wikicode.nodes:
if in_quote:
if type(n) in [mwparserfromhell.nodes.Text, mwparserfromhell.nodes.Wikilink]:
current_quote += mwparserfromhell.parse(str(n)).strip_code().replace('\n', ' ')
if type(n) not in [mwparserfromhell.nodes.Text, mwparserfromhell.nodes.Wikilink] or len(n.replace('\n', ' ').strip()) == 0:
if len(current_quote) > 10:
writer.append(
{"text": current_quote, "source": self.current_title, "tags": []})
self.current_quotes += 1
current_quote = ''
if type(n) is mwparserfromhell.nodes.Tag and n == '*':
in_quote = True
else:
if type(n) is mwparserfromhell.nodes.Tag and n == '*':
in_quote = True
self.current_tag = None
def startElement(self, name, attrs):
if name == 'title':
self.current_tag = 'title'
self.current_title = ''
if name == 'text':
self.current_tag = 'text'
self.current_raw_page = ''
parser = xml.sax.make_parser()
parser.setContentHandler(WikiDumpHandler())
parser.parse(bz_file)
writer.close()