Skip to content

Commit 4c9fb36

Browse files
committed
Add Vietnamese support using pyvi
1 parent d938043 commit 4c9fb36

File tree

4 files changed

+87
-2
lines changed

4 files changed

+87
-2
lines changed

Diff for: README.md

+11
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@ MorphMan supports the following languages:
3030
- **Japanese**: You must additionally install the _[Japanese Support](https://ankiweb.net/shared/info/3918629684)_ Anki addon
3131
- **Chinese**: For Anki 2.0, please use [Jieba-Morph](https://github.com/NinKenDo64/Jieba-Morph). Chinese is included in Morphman for Anki 2.1
3232
- **CJK Characters**: Morphemizer that splits sentence into characters and filters for Chinese-Japanese-Korean logographic/idiographic characters.
33+
- **Vietnamese**: You must run Anki from source and install [pyvi](https://github.com/trungtv/pyvi) into its virtualenv:
34+
- `git clone https://github.com/ankitects/anki.git`
35+
- `cd anki`
36+
- `make develop`
37+
- Make sure you have the dependencies listed in anki/README.development
38+
- `source pyenv/bin/activate`
39+
- `pip install pyvi`
40+
41+
Then run Anki
42+
- `./run`
43+
3344
- more languages can be added on request if morpheme-splitting-tools are available for it
3445

3546
See Matt VS Japan's [video tutorial](https://www.youtube.com/watch?v=dVReg8_XnyA)

Diff for: morph/cli.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from collections import Counter
88

99
from .morphemes import MorphDb
10-
from .morphemizer import SpaceMorphemizer, MecabMorphemizer, CjkCharMorphemizer, JiebaMorphemizer
10+
from .morphemizer import SpaceMorphemizer, MecabMorphemizer, CjkCharMorphemizer, JiebaMorphemizer, VietnameseMorphemizer
1111

1212

1313
# hack: typing is compile time anyway, so, nothing bad happens if it fails, the try is to support anki < 2.1.16
@@ -88,6 +88,7 @@ def db_path(db_name):
8888
'mecab': MecabMorphemizer(),
8989
'cjkchar': CjkCharMorphemizer(),
9090
'jieba': JiebaMorphemizer(),
91+
'vietnamese': VietnameseMorphemizer(),
9192
}
9293

9394

Diff for: morph/morphemizer.py

+40-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from .deps.zhon.hanzi import characters
66
from .mecab_wrapper import getMorphemesMecab, getMecabIdentity
77
from .deps.jieba import posseg
8+
import importlib.util
89

910

1011
####################################################################################################
@@ -30,14 +31,23 @@ def getName(self):
3031
# type: () -> str
3132
return self.__class__.__name__
3233

34+
def exists(self):
    # type: () -> bool
    """
    Report whether this morphemizer can be used in the current environment.

    The base implementation always returns True. getAllMorphemizers() calls
    this to filter its result, so a subclass that depends on an optional
    package can override it to hide itself when that package is missing.
    """
    return True
37+
3338

3439
####################################################################################################
3540
# Morphemizer Helpers
3641
####################################################################################################
3742

3843
def getAllMorphemizers():
    # type: () -> [Morphemizer]
    """
    Return one instance of every morphemizer that is currently usable.

    Each candidate is asked whether it exists(); those answering False
    (e.g. because an optional dependency is not installed) are left out.
    """
    candidates = [SpaceMorphemizer(), MecabMorphemizer(), JiebaMorphemizer(), VietnameseMorphemizer(), CjkCharMorphemizer()]
    # Build a new list rather than calling remove() while iterating: removing
    # during iteration shifts the remaining items left, so the element right
    # after a removed one would never be checked.
    return [m for m in candidates if m.exists()]
4151

4252

4353
def getMorphemizerByName(name):
@@ -90,6 +100,35 @@ def getDescription(self):
90100
return 'Language w/ Spaces'
91101

92102

103+
####################################################################################################
104+
# Vietnamese Morphemizer
105+
####################################################################################################
106+
107+
class VietnameseMorphemizer(Morphemizer):
    """
    Morphemizer for Vietnamese, backed by the optional pyvi tokenizer.

    Vietnamese contains many compound words where the polysyllabic morphemes
    are divided by spaces, so an extra tool - pyvi - is used instead.
    """

    def exists(self):
        # type: () -> bool
        """
        Return True only when pyvi can be imported.

        pyvi has large dependencies. To avoid bundling it or forcing users to
        install it as a dependency, the Vietnamese morphemizer only appears if
        pyvi is importable.
        """
        try:
            return importlib.util.find_spec('pyvi') is not None
        except (ImportError, ValueError):
            # find_spec raises ValueError when sys.modules already holds a
            # 'pyvi' entry whose __spec__ is None, and a finder may raise
            # ImportError. Either way pyvi is not usable, so report it as
            # missing instead of letting the lookup crash the caller.
            return False

    def getMorphemesFromExpr(self, expression):
        """
        Split a Vietnamese expression into morphemes.

        The expression is first tokenized by pyvi, then handed to
        SpaceMorphemizer's splitting logic. Underscores in each resulting
        morpheme's `base` are turned back into spaces, so a compound word
        ends up as a single morpheme whose base contains spaces.
        """
        # Imported lazily so that this module still loads without pyvi.
        from pyvi import ViTokenizer

        # Reuses SpaceMorphemizer's implementation directly, passing this
        # instance as `self`.
        tokens = SpaceMorphemizer.getMorphemesFromExpr(self, ViTokenizer.tokenize(expression))
        # NOTE(review): only `base` is rewritten here; if the morpheme objects
        # carry the token text in other fields, those keep the underscore -
        # confirm that is intended.
        for word in tokens:
            word.base = word.base.replace('_', ' ')

        return tokens

    def getDescription(self):
        # type: () -> str
        """Return the human-readable name of this morphemizer."""
        return 'Vietnamese'
130+
131+
93132
####################################################################################################
94133
# CJK Character Morphemizer
95134
####################################################################################################

Diff for: test/test_vietnamese_morphemizer.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from morph.morphemizer import getMorphemizerByName
2+
import unittest
3+
4+
class TestVietnameseMorphemizer(unittest.TestCase):
    """Checks the morpheme bases produced by the Vietnamese morphemizer."""

    def setUp(self):
        self.morphemizer = getMorphemizerByName("VietnameseMorphemizer")
        # The lookup is expected to yield None when the morphemizer is not
        # available (pyvi missing). Report that as a skipped test instead of
        # printing a message and letting the test count as a pass.
        if self.morphemizer is None:
            self.skipTest('pyvi is not installed, skipping Vietnamese tests')

    def _bases(self, sentence):
        """Return the `base` of every morpheme found in `sentence`, in order."""
        return [m.base for m in self.morphemizer.getMorphemesFromExpr(sentence)]

    def test_morpheme_generation(self):
        sentence_1 = ("Trăm năm trong cõi người ta,"
                      " Chữ tài chữ mệnh khéo là ghét nhau."
                      " Trải qua một cuộc bể dâu,"
                      " Những điều trông thấy mà đau đớn lòng.")

        case_1 = ["trăm năm", "trong", "cõi", "người ta", "chữ", "tài", "chữ", "mệnh",
                  "khéo", "là", "ghét", "nhau", "trải", "qua", "một", "cuộc", "bể dâu",
                  "những", "điều", "trông", "thấy", "mà", "đau đớn", "lòng"]

        sentence_2 = "Mặt Trời"

        case_2 = ["mặt trời"]

        # Compare whole lists: an element-by-element loop over the produced
        # morphemes would pass vacuously if too few (or none) were returned.
        self.assertEqual(self._bases(sentence_1), case_1)
        self.assertEqual(self._bases(sentence_2), case_2)


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)