kaegi · rteabeault · Jan 11, 2021 · Jan 11, 2021
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@
 .vscode
 .DS_Store
 meta.json
-.idea
+.idea
+*.iml
diff --git a/README.md b/README.md
@@ -30,12 +30,110 @@ MorphMan supports the following languages:
 - **Japanese**: You must additionally install the _[Japanese Support](https://ankiweb.net/shared/info/3918629684)_ Anki addon
 - **Chinese**: For Anki 2.0, please use [Jieba-Morph](https://github.com/NinKenDo64/Jieba-Morph). Chinese is included in Morphman for Anki 2.1
 - **CJK Characters**: Morphemizer that splits sentence into characters and filters for Chinese-Japanese-Korean logographic/idiographic characters.
+- **spaCy**: [SpaCy](https://spacy.io/) is a free open-source library for Natural Language Processing in Python. See the spaCy section below for more information.
 - more languages can be added on request if morpheme-splitting-tools are available for it
 
 See Matt VS Japan's [video tutorial](https://www.youtube.com/watch?v=dVReg8_XnyA)
 and accompanying [blog post](https://massimmersionapproach.com/table-of-contents/anki/morphman).
 See the [MorphMan wiki](https://github.com/kaegi/MorphMan/wiki) for more information.
 
+## spaCy
+[SpaCy](https://spacy.io/) is a free open-source library for Natural Language Processing in Python.
+Machine learning models for a variety of languages are available, including Chinese, Danish, Dutch,
+English, French, German, Greek, Italian, Japanese, Lithuanian, Norwegian Bokmål, Polish,
+Portuguese, Romanian, and Spanish. Additionally, spaCy provides the tools to train additional
+language models if you desire.
+
+### Requirements
+* Current installation of Python 3. (Currently tested on Python 3.8.5)
+* spaCy installed
+* One or more desired language models installed and linked.
+
+### Installation
+
+1. Install Python if it is not already installed. See the
+[Python download page](https://www.python.org/downloads/) for more information.
+
+2. Determine the path to your Python executable and add it to `config.py`.
+    On Unix/macOS this can be done with the `which` command using a terminal:
+    ```
+    > which python
+    > /Users/someperson/workspace/spacy_test/venv/bin/python
+    ```
+
+    On Windows you can usually find it with the `where` command using the command prompt:
+    ```
+    C:\>where python
+    C:\Users\someperson\AppData\Local\Microsoft\WindowsApps\python.exe
+    ```
+
+   Once you have the path, open `config.py` in your MorphMan installation and set
+   `path_python` to the path value.
+
+   Change
+   ```
+   'path_python': None     
+   ```
+   to your path
+   ```
+   'path_python': '/Users/someperson/somepython/bin/python',
+   ```
+
+3. Install spaCy.
+
+    Unix/macOS
+    ```
+    python -m pip install spacy
+    ```
+
+    Windows
+    ```
+    py -m pip install spacy
+    ```
+
+   For more information on installing spaCy, see the
+   [installation instructions](https://spacy.io/usage).
+
+4. Install and link the desired spaCy models.
+   You must install the spaCy model and then make sure it is linked. For example, if you wanted
+   to use the German model `de_core_news_sm`, you would do the following:
+   ```
+   python -m spacy download de_core_news_sm
+   python -m spacy link de_core_news_sm <link_name>
+   ```
+
+   "spaCy - `link_name`" is the name that will show up in MorphMan when selecting a morphemizer. 
+   For example, if we did 
+   ```
+   python -m spacy link de_core_news_sm de
+   ```
+
+   then we should see "spaCy - de" as a morphemizer option.
+
+    You can verify which models are installed for spaCy:
+    ```
+    python -m spacy info
+
+    ============================== Info about spaCy ==============================
+
+    spaCy version    2.3.5
+    Location         /Users/someperson/python3.8/site-packages/spacy
+    Platform         macOS-10.15.7-x86_64-i386-64bit
+    Python version   3.8.5
+    Models           ja, de
+    ```
+
+   Here you can see two models have been installed and linked, `ja` and `de`.
+
+   For more information on installing spaCy models, see the
+   [installation instructions](https://spacy.io/usage/models).
+
+### Debugging
+If you find you are having issues getting MorphMan to recognize your installed models, there may
+be valuable log output in the MorphMan log file. By default this log file should be in the root of your
+Anki profile directory and called `morphman.log`. Please include the output of this log file when
+opening any issues.
+
 # Development
 - Set up local environment:
   - The best is to use a Python virtual environment and install prebuilt Anki wheels:

diff --git a/__init__.py b/__init__.py
@@ -90,6 +90,7 @@ def main():
     from .morph.browser import alreadyKnownTagger
     from .morph import newMorphHelper
     from .morph import stats
+    from .morph import morphemizer_registry
 
 
     anki.stats.CollectionStats.easeGraph = \

diff --git a/morph/UI/morphemizerComboBox.py b/morph/UI/morphemizerComboBox.py
@@ -1,31 +1,34 @@
-
+from PyQt5.QtCore import Qt
 from PyQt5.QtWidgets import QComboBox
 
-
 class MorphemizerComboBox(QComboBox):
+    name_role = Qt.UserRole + 1
+    morphemizer_role = Qt.UserRole + 2
 
-    def setMorphemizers(self, morphemizers):
-        if type(morphemizers) == list:
-            self.morphemizers = morphemizers
-        else:
-            self.morphemizers = []
+    def __init__(self, morphemizerRegistry=None, parent=None):
+        super(MorphemizerComboBox, self).__init__(parent)
 
-        for morphemizer in self.morphemizers:
-            self.addItem(morphemizer.getDescription())
+        if morphemizerRegistry:
+            self.setMorphemizerRegistry(morphemizerRegistry)
 
         self.setCurrentIndex(0)
 
+    def setMorphemizerRegistry(self, morphemizerRegistry):
+        morphemizerRegistry.morphemizer_added.connect(self._add_morphemizer)
+        morphemizerRegistry.morphemizer_removed.connect(self._remove_morphemizer)
+
+        for morphemizer in morphemizerRegistry.getMorphemizers():
+            self._add_morphemizer(morphemizer)
+
     def getCurrent(self):
-        try:
-            return self.morphemizers[self.currentIndex()]
-        except IndexError:
-            return None
+        return self.currentData()
 
     def setCurrentByName(self, name):
-        active = False
-        for i, morphemizer in enumerate(self.morphemizers):
-            if morphemizer.getName() == name:
-                active = i
-        if active:
-            self.setCurrentIndex(active)
+        self.setCurrentIndex(self.findData(name, role=self.name_role))
+
+    def _add_morphemizer(self, morphemizer):
+        self.addItem(morphemizer.getDescription(), morphemizer)
+        self.setItemData(self.findData(morphemizer), morphemizer.getName(), self.name_role)
 
+    def _remove_morphemizer(self, morphemizer):
+        self.removeItem(self.findText(morphemizer.getDescription()))
diff --git a/morph/browser/extractMorphemes.py b/morph/browser/extractMorphemes.py
@@ -3,7 +3,6 @@
 from anki.hooks import addHook
 from anki.utils import stripHTML
 from ..morphemes import AnkiDeck, MorphDb, getMorphemes
-from ..morphemizer import getMorphemizerByName
 from ..util import addBrowserNoteSelectionCmd, mw, getFilter, infoMsg, QFileDialog, runOnce
 from ..preferences import get_preference as cfg
 
@@ -21,7 +20,7 @@ def per(st, n):
     if note_cfg is None:
         return st
 
-    morphemizer = getMorphemizerByName(note_cfg['Morphemizer'])
+    morphemizer = mw.morphemizerRegistry.getMorphemizer(note_cfg['Morphemizer'])
     for f in note_cfg['Fields']:
         ms = getMorphemes(morphemizer, stripHTML(n[f]), n.tags)
         loc = AnkiDeck(n.id, f, n[f], n.guid, mats)

diff --git a/morph/browser/massTagger.py b/morph/browser/massTagger.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
+from aqt import mw
 from aqt.utils import tooltip
 from anki.hooks import addHook
 from anki.utils import stripHTML
 from ..morphemes import getMorphemes, MorphDb
-from ..morphemizer import getMorphemizerByName
 from ..util import addBrowserNoteSelectionCmd, getFilter, infoMsg, QInputDialog, QFileDialog, QLineEdit, runOnce
 from ..preferences import get_preference as cfg
 from anki.lang import _
@@ -31,7 +31,7 @@ def per(st, n):  # :: State -> Note -> State
     note_cfg = getFilter(n)
     if note_cfg is None:
         return st
-    morphemizer = getMorphemizerByName(note_cfg['Morphemizer'])
+    morphemizer = mw.morphemizerRegistry.getMorphemizer(note_cfg['Morphemizer'])
     for field in note_cfg['Fields']:
         for m in getMorphemes(morphemizer, stripHTML(n[field]), n.tags):
             if m in st['db'].db:

diff --git a/morph/browser/viewMorphemes.py b/morph/browser/viewMorphemes.py
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 from anki.hooks import addHook
 from anki.utils import stripHTML
+from aqt import mw
+
 from ..morphemes import getMorphemes, ms2str
-from ..morphemizer import getMorphemizerByName
 from ..util import addBrowserNoteSelectionCmd, getFilter, infoMsg, runOnce
 from ..preferences import get_preference as cfg
 
@@ -15,7 +16,7 @@ def per(st, n):
     if notecfg is None:
         return st
 
-    morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
+    morphemizer = mw.morphemizerRegistry.getMorphemizer(notecfg['Morphemizer'])
     for f in notecfg['Fields']:
         ms = getMorphemes(morphemizer, stripHTML(n[f]), n.tags)
         st['morphemes'] += ms

diff --git a/morph/config.py b/morph/config.py
@@ -14,6 +14,7 @@
     'path_seen': os.path.join(mw.pm.profileFolder(), 'dbs', 'seen.db'),
     'path_log': os.path.join(mw.pm.profileFolder(), 'morphman.log'),
     'path_stats': os.path.join(mw.pm.profileFolder(), 'morphman.stats'),
+    'path_python': None,
 
     # change the thresholds for various stages of maturity, in days
     'threshold_mature': 21,         # 21 days is what Anki uses

diff --git a/morph/deps/mecab/mecab b/morph/deps/mecab/mecab
diff --git a/morph/deps/spacy/__init__.py b/morph/deps/spacy/__init__.py
@@ -0,0 +1,54 @@
+import re
+import subprocess
+
+from .morphemizer import SpacyMorphemizer
+from ...preferences import get_preference
+from ...subprocess_util import platform_subprocess_args
+from ...util import printf
+
+
+def init_spacy(morphemizerRegistry):
+    printf("Initializing Spacy!")
+
+    python_path = get_preference('path_python')
+
+    if python_path:
+        models = _spacy_models(python_path)
+        if models:
+            for model in models:
+                printf(f"Creating morphemizer for spacy model {model}.")
+                register_morphemizer(morphemizerRegistry, model)
+        else:
+            printf("No models were installed for spaCy.")
+    else:
+        printf('Python path not specified in config.py.')
+
+
+def _parse_morphemizers(info_command_result):
+    m = re.search('^Models\\s+(.*)$', info_command_result, re.MULTILINE)
+    return [x.strip() for x in m.group(1).split(',')]
+
+
+def _spacy_models(python_path):
+    cmd = [python_path, '-m', 'spacy', 'info']
+    printf(f"Collecting spacy model info: {cmd}")
+
+    result = subprocess.run(
+        [python_path, '-m', 'spacy', 'info'],
+        capture_output=True,
+        **platform_subprocess_args())
+
+    if result.returncode != 0:
+        printf('Command to find spaCy models failed. Please ensure python is installed at the path '
+               'given in config.py under the "path_python" key and spaCy is installed in that '
+               'python installation')
+        return None
+    else:
+        printf(result)
+        output = result.stdout.decode('utf-8')
+        printf(f"spaCy info returned the following: {output}")
+        return _parse_morphemizers(output)
+
+
+def register_morphemizer(morphemizerRegistry, model):
+    morphemizerRegistry.addMorphemizer(SpacyMorphemizer(model))
diff --git a/morph/deps/spacy/extract_morphemes.py b/morph/deps/spacy/extract_morphemes.py
@@ -0,0 +1,53 @@
+import argparse
+import json
+import sys
+
+import spacy
+
+
+POS_BLACKLIST = [
+    'SPACE',
+    'PUNCT',
+    'NUM',
+]
+
+
+def process_input(model):
+    nlp = spacy.load(model)
+    for line in sys.stdin:
+        doc = nlp(line)
+        result = list(map(lambda t: _createMorpheme(t, doc), filter(_filter_tokens, doc)))
+        print(json.dumps(result))
+        sys.stdout.flush()
+
+
+def _createMorpheme(token, doc):
+    reading = token.lemma_
+    if "reading_forms" in doc.user_data:
+        reading_forms = doc.user_data["reading_forms"]
+        if token.i < len(reading_forms):
+            reading = "" if reading_forms[token.i] is None else reading_forms[token.i]
+
+    return {
+        'norm': token.lemma_,
+        'base': token.norm_,
+        'inflected': token.text,
+        'read': reading,
+        'pos': token.pos_,
+        'subPos': "*"
+    }
+
+
+def _filter_tokens(token):
+    return not token.pos_ in POS_BLACKLIST
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="""Listens on stdin and process given text with spaCy. After starting pass in
+    line delimited text to process with spaCy using the given model. The output is a json
+    object containing norm, base, inflected, read, pos, and subpos."""
+    )
+    parser.add_argument('--model', required=True, help="Model to use for processing text. ")
+    args = parser.parse_args()
+    process_input(args.model)
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,4 +2,5 @@ @@
     .vscode
     .DS_Store
     meta.json
-    .idea
+    .idea
+    *.iml