Add initial support for SpaCy and SudachiPy parsers. #193

Status: Open · wants to merge 1 commit into master
120 changes: 120 additions & 0 deletions morph/deps/python/parser_helper.py
@@ -0,0 +1,120 @@
import importlib.util
import json
import re
import sys

# TODO: Figure out if we can just enumerate all installed spaCy models directly.
known_spacy_models = [
    ('en_core_web_sm', 'spaCy English Core Web (small)'),
    ('en_core_web_md', 'spaCy English Core Web (medium)'),
    ('en_core_web_lg', 'spaCy English Core Web (large)'),
    ('es_core_news_sm', 'spaCy Spanish Core News (small)'),
    ('es_core_news_md', 'spaCy Spanish Core News (medium)'),
    ('es_core_news_lg', 'spaCy Spanish Core News (large)'),
    ('fr_core_news_sm', 'spaCy French Core News (small)'),
    ('fr_core_news_md', 'spaCy French Core News (medium)'),
    ('fr_core_news_lg', 'spaCy French Core News (large)'),
    ('it_core_news_sm', 'spaCy Italian Core News (small)'),
    ('it_core_news_md', 'spaCy Italian Core News (medium)'),
    ('it_core_news_lg', 'spaCy Italian Core News (large)'),
    ('ja_core_news_sm', 'spaCy Japanese Core News (small)'),
    ('ja_core_news_md', 'spaCy Japanese Core News (medium)'),
    ('ja_core_news_lg', 'spaCy Japanese Core News (large)'),
    ('ro_core_news_sm', 'spaCy Romanian Core News (small)'),
    ('ro_core_news_md', 'spaCy Romanian Core News (medium)'),
    ('ro_core_news_lg', 'spaCy Romanian Core News (large)'),
    ('zh_core_web_sm', 'spaCy Chinese Core Web (small)'),
    ('zh_core_web_md', 'spaCy Chinese Core Web (medium)'),
    ('zh_core_web_lg', 'spaCy Chinese Core Web (large)'),
]

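# Force UTF-8 on stdin/stdout so non-ASCII text survives platforms whose
# default console encoding is not UTF-8 (e.g. Windows).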
sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')

def dump_parsers():
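    """Print the available (name, description) parser pairs to stdout as JSON."""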
parsers = []

# Check for SudachiPy
if importlib.util.find_spec('sudachipy') is not None:
parsers.append(('sudachipy:a', 'Sudachi Japanese (narrow)'))
parsers.append(('sudachipy:b', 'Sudachi Japanese (normal)'))
parsers.append(('sudachipy:c', 'Sudachi Japanese (wide)'))

    # Check for spaCy, and for each known model installed alongside it
if importlib.util.find_spec('spacy') is not None:
for m in known_spacy_models:
if importlib.util.find_spec(m[0]) is not None:
parsers.append(('spacy:' + m[0], m[1]))

print(json.dumps(parsers))
sys.stdout.flush()

def interactive_mode():
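    """Read lines from stdin and print one JSON array of morpheme tuples per line."""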
module = sys.argv[2].split(':')
if module[0] == 'spacy':
import spacy
nlp = spacy.load(module[1])

        for line in sys.stdin:
            doc = nlp(line.strip())
            # Japanese pipelines (tokenized by SudachiPy) expose kana readings
            # through doc.user_data["reading_forms"], indexed by token order.
            if "reading_forms" in doc.user_data:
def proc_morph(w):
reading = doc.user_data["reading_forms"][w.i]
reading = "" if reading is None else reading
return (w.lemma_, w.norm_, w.text, reading, w.pos_, '*')
result = [proc_morph(w) for w in doc]
else:
result = [(w.lemma_, w.norm_, w.text, w.lemma_, w.pos_, '*') for w in doc]
print(json.dumps(result))
sys.stdout.flush()
elif module[0] == 'sudachipy':
from sudachipy import tokenizer
from sudachipy import dictionary

        if module[1] == 'a':
            mode = tokenizer.Tokenizer.SplitMode.A
        elif module[1] == 'b':
            mode = tokenizer.Tokenizer.SplitMode.B
        elif module[1] == 'c':
            mode = tokenizer.Tokenizer.SplitMode.C
        else:
            raise ValueError('unknown Sudachi split mode: ' + module[1])

tokenizer_obj = dictionary.Dictionary().create()
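        # Loading the Sudachi dictionary is slow, so build the tokenizer once and reuse it.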

        # Exclude morphemes containing Latin letters or half/full-width digits
        alpha_num = re.compile('[a-zA-Z0-9０-９]')

for line in sys.stdin:
try:
line = line.strip()
def proc_morph(m):
dform = m.dictionary_form()
surf = m.surface()
reading = m.reading_form()

                    # If the surface form is conjugated, re-tokenize the
                    # dictionary form to recover its unconjugated reading.
                    if dform != surf:
morphs2 = tokenizer_obj.tokenize(dform, mode)
if len(morphs2) == 1 and morphs2[0].dictionary_form() == dform:
reading = morphs2[0].reading_form()

return (
m.normalized_form(),
dform,
surf,
reading,
m.part_of_speech()[0],
m.part_of_speech()[1],
)

                result = [proc_morph(m)
                          for m in tokenizer_obj.tokenize(line, mode)
                          if alpha_num.search(m.surface()) is None]
            except Exception:
                # Never let one bad line kill the pipe; report no morphemes instead.
                result = []
            print(json.dumps(result))
sys.stdout.flush()

if __name__ == '__main__':
if sys.argv[1] == 'parsers':
dump_parsers()
elif sys.argv[1] == 'interact':
interactive_mode()
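
For reference, the helper speaks a plain line protocol and can be exercised outside the add-on. Below is a minimal sketch of driving it from Python; the path and the sample sentence are illustrative, and it assumes SudachiPy and its dictionary are installed:

import json
import subprocess

helper = 'morph/deps/python/parser_helper.py'  # path is illustrative

# 'parsers' prints one JSON list of (name, description) pairs and exits.
available = json.loads(subprocess.check_output(['python', helper, 'parsers']))

# 'interact' answers each stdin line with one JSON array of
# (normalized, dictionary_form, surface, reading, pos, sub_pos) tuples.
reply = subprocess.run(['python', helper, 'interact', 'sudachipy:b'],
                       input='吾輩は猫である\n'.encode('utf-8'),
                       stdout=subprocess.PIPE)
morphs = json.loads(reply.stdout.decode('utf-8'))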
76 changes: 75 additions & 1 deletion morph/morphemizer.py
@@ -1,10 +1,14 @@
# -*- coding: utf-8 -*-
import json
import re
import os
import subprocess

from .morphemes import Morpheme
from .deps.zhon.hanzi import characters
from .mecab_wrapper import getMorphemesMecab, getMecabIdentity
from .deps.jieba import posseg
from .util_external import memoize


####################################################################################################
@@ -35,9 +39,25 @@ def getName(self):
# Morphemizer Helpers
####################################################################################################

@memoize
def getAllMorphemizers():
    # type: () -> [Morphemizer]
    morphemizers = [SpaceMorphemizer(), MecabMorphemizer(), JiebaMorphemizer(), CjkCharMorphemizer()]

    # Ask the system Python which optional parsers (spaCy models, SudachiPy)
    # are installed; failure simply means none are available.
    try:
        my_dir = os.path.dirname(__file__)
        helper_path = os.path.join(my_dir, 'deps', 'python', 'parser_helper.py')
        output = subprocess.check_output(['python', helper_path, 'parsers'])
        parsers = json.loads(output.decode('utf-8'))
        for parser in parsers:
            morphemizers.append(PythonMorphemizer(parser[0], parser[1]))
    except Exception:
        pass

    return morphemizers


def getMorphemizerByName(name):
@@ -70,6 +90,60 @@ def getMorphemesFromExpr(self, expression):
def getDescription(self):
return 'Japanese ' + getMecabIdentity()

####################################################################################################
# Python Morphemizer
####################################################################################################

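# Part-of-speech tags to drop before building Morphemes. The lists mix
# UniDic-style Japanese tags (from Sudachi) with Universal POS tags
# (from spaCy), since both parsers feed the same pipeline.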
POS_BLACKLIST = [
'記号', # "symbol", generally punctuation
'補助記号', # "symbol", generally punctuation
'空白', # Empty space
'SPACE',
'PUNCT',
'NUM'
]
SUBPOS_BLACKLIST = [
'数詞', # Numbers
]

class PythonMorphemizer(Morphemizer):
    """
    Tokenizes text with parsers from the system Python installation
    (currently spaCy and SudachiPy), driven through parser_helper.py.
    """

def __init__(self, parser_name, description):
self.parser_name = parser_name
self.description = description
self.proc = None

@memoize
def getMorphemesFromExpr(self, expression):
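        # Spawn the helper lazily and keep it running so the underlying
        # model is loaded only once per parser.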
if not self.proc:
my_dir = os.path.dirname(__file__)
helper_path = os.path.join(my_dir, 'deps', 'python', 'parser_helper.py')
cmd = ['python', helper_path, 'interact', self.parser_name]
self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)

        # The helper prints exactly one JSON array per input line, so write
        # the expression and read back one reply for each line it contains.
        expr = expression.encode('utf-8', 'ignore')
        self.proc.stdin.write(expr + b'\n')
        self.proc.stdin.flush()

        res = []
        for _ in expr.split(b'\n'):
            morphs = json.loads(self.proc.stdout.readline())
            res.extend([Morpheme(m[0], m[1], m[2], m[3], m[4], m[5])
                        for m in morphs
                        if m[4] not in POS_BLACKLIST and m[5] not in SUBPOS_BLACKLIST])
        return res

def getName(self):
return self.parser_name

def getDescription(self):
return self.description


####################################################################################################
# Space Morphemizer