Skip to content

Commit

Permalink
Japanese parser tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
jzohrab committed Oct 23, 2023
1 parent f0ddaa1 commit d2418c1
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 19 deletions.
55 changes: 37 additions & 18 deletions lute/parse/mecab_parser.py
Original file line number Diff line number Diff line change
def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
    """
    Parse the text into ParsedTokens using MeCab.

    Each token records its text, whether it is a word, and whether it
    ends a sentence.  Newlines in the text become '¶' paragraph-marker
    tokens (also treated as end-of-sentence).
    """
    # Collapse runs of spaces/tabs; MeCab output is tab-delimited, so
    # stray tabs in the input would corrupt the parsed fields.
    text = re.sub(r'[ \t]+', ' ', text).strip()

    lines = []

    # If the string contains a "\n", MeCab appears to silently
    # remove it.  Splitting it works (ref test_JapaneseParser).
    # Flags: ref https://github.com/buruzaemon/natto-py:
    #  -F = node format
    #  -U = unknown format
    #  -E = EOP format
    with MeCab(r'-F %m\t%t\t%h\n -U %m\t%t\t%h\n -E EOP\t3\t7\n') as nm:
        for para in text.split("\n"):
            for n in nm.parse(para, as_nodes=True):
                lines.append(n.feature)

    # Drop empty/None features (e.g. BOS/EOS nodes with no format).
    lines = [
        n.strip() for n in lines
        if n is not None and n.strip() != ''
    ]

    def line_to_token(lin):
        "Convert parsed line to a ParsedToken."
        term, node_type, third = lin.split("\t")
        is_eos = term in language.regexp_split_sentences
        # 'EOP\t3\t7' is the end-of-paragraph marker emitted by -E;
        # render it as a pilcrow.
        if term == 'EOP' and third == '7':
            term = '¶'
        #

        # %t char-type codes 2, 6, 7, 8 are word-like tokens.
        is_word = node_type in '2678'
        return ParsedToken(term, is_word, is_eos or term == '¶')

    tokens = [line_to_token(lin) for lin in lines]
    return tokens


Expand All @@ -86,17 +95,27 @@ def _string_is_hiragana(self, s: str) -> bool:
return all(self._char_is_hiragana(c) for c in s)


def get_reading(self, text: str):
    """
    Get the pronunciation (yomi, in katakana) for the given text.

    Returns None if the text is all hiragana, or the pronunciation
    doesn't add value (same as text).
    """
    # All-hiragana text is already its own reading.
    if self._string_is_hiragana(text):
        return None

    # -O yomi = output the reading only, one node per line.
    flags = r'-O yomi'
    readings = []
    with MeCab(flags) as nm:
        for n in nm.parse(text, as_nodes=True):
            readings.append(n.feature)

    # Drop empty/None features (e.g. BOS/EOS nodes).
    readings = [
        r.strip() for r in readings
        if r is not None and r.strip() != ''
    ]

    ret = ''.join(readings).strip()
    # No reading found, or the reading is identical to the input:
    # nothing useful to show.
    if ret in ('', text):
        return None
    return ret
9 changes: 8 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ def fixture_spanish(app_context, demo_yaml_folder):

@pytest.fixture(name="english")
def fixture_english(app_context, demo_yaml_folder):
    # Fixed copy-paste docstring: this builds the *English* language,
    # not "jp"/"spanish".
    "Make english from demo file."
    f = os.path.join(demo_yaml_folder, 'english.yaml')
    return _get_language(f)


@pytest.fixture(name="japanese")
def fixture_japanese(app_context, demo_yaml_folder):
    "Make jp from demo file."
    yaml_file = os.path.join(demo_yaml_folder, 'japanese.yaml')
    return _get_language(yaml_file)
15 changes: 15 additions & 0 deletions tests/features/rendering.feature
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,21 @@ Feature: Rendering
ışık(1)/ /için(3)/ /Işık(1)/ /İçin(3)/.



Scenario: Japanese text.
Given language Japanese
And text:
私は元気です.
Then rendered should be:
私/は/元気/です/.
Given terms:
元気
です
Then rendered should be:
私(1)/は/元気(1)/です(1)/.


Scenario: Japanese multiword at end of sentence.
Given language Japanese
And text:
Expand Down
71 changes: 71 additions & 0 deletions tests/unit/parse/test_JapaneseParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
JapaneseParser tests.
"""

from lute.parse.mecab_parser import JapaneseParser
from lute.parse.base import ParsedToken


def assert_tokens_equals(text, lang, expected):
    """
    Parsing a text using a language should give the expected parsed tokens.
    expected is given as array of:
    [ original_text, is_word, is_end_of_sentence ]
    """
    parser = JapaneseParser()
    actual_tokens = parser.get_parsed_tokens(text, lang)
    expected_tokens = [ParsedToken(*e) for e in expected]
    # Compare string forms for readable assertion failure output.
    assert [str(t) for t in actual_tokens] == [str(t) for t in expected_tokens]



def test_end_of_sentence_stored_in_parsed_tokens(japanese):
    "ParsedToken is marked as EOS=True at ends of sentences."
    # Both half-width (. ? !) and full-width (。 ? !) sentence enders,
    # with a newline separating the two halves.
    s = "元気.元気?元気!\n元気。元気?元気!"

    # Expected tuples: (token_text, is_word[, is_end_of_sentence]).
    # '¶' is the paragraph marker the parser emits for the newline;
    # it counts as end-of-sentence but not as a word.
    expected = [
        ( "元気", True ),
        ( ".", False, True ),
        ( "元気", True ),
        ( "?", False, True ),
        ( "元気", True ),
        ( "!", False, True ),
        ( "¶", False, True ),
        ( "元気", True ),
        ( "。", False, True ),
        ( "元気", True ),
        ( "?", False, True ),
        ( "元気", True ),
        ( "!", False, True ),
        ( "¶", False, True )
    ]
    assert_tokens_equals(s, japanese, expected)


def test_readings():
    """
    Parser returns readings if they add value.
    """
    parser = JapaneseParser()

    # Romaji, katakana-only, and hiragana-only strings get no reading:
    # a reading would add nothing.
    for text in ('NHK', 'ツヨイ', 'どちら'):
        assert parser.get_reading(text) is None, text

    zws = '\u200B'
    reading_cases = [
        ('強い', 'ツヨイ'),
        ('二人', 'ニニン'),  # ah well, not perfect :-)
        ('強いか', 'ツヨイカ'),
        # zero-width-space ignored
        (f"強い{zws}か", f"ツヨイ{zws}カ"),
    ]

    for text, expected in reading_cases:
        assert parser.get_reading(text) == expected, text

0 comments on commit d2418c1

Please sign in to comment.