Skip to content

Commit

Permalink
Japanese parser tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
jzohrab committed Oct 23, 2023
1 parent f0ddaa1 commit d2418c1
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 19 deletions.
55 changes: 37 additions & 18 deletions lute/parse/mecab_parser.py
Original file line number Diff line number Diff line change
def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
    """
    Parse the text into ParsedTokens using MeCab.

    Each token records its text, whether it is a word, and whether it
    ends a sentence.  Newlines in the text become '¶' paragraph-marker
    tokens (also treated as end-of-sentence).
    """
    # Collapse runs of spaces/tabs; MeCab output is tab-delimited, so
    # stray tabs in the input would corrupt the parsed fields.
    text = re.sub(r'[ \t]+', ' ', text).strip()

    lines = []

    # If the string contains a "\n", MeCab appears to silently
    # remove it.  Splitting it works (ref test_JapaneseParser).
    # Flags: ref https://github.com/buruzaemon/natto-py:
    #  -F = node format
    #  -U = unknown format
    #  -E = EOP format
    with MeCab(r'-F %m\t%t\t%h\n -U %m\t%t\t%h\n -E EOP\t3\t7\n') as nm:
        for para in text.split("\n"):
            for n in nm.parse(para, as_nodes=True):
                lines.append(n.feature)

    # Drop empty/None features (e.g. BOS/EOS nodes with no format).
    lines = [
        n.strip() for n in lines
        if n is not None and n.strip() != ''
    ]

    def line_to_token(lin):
        "Convert parsed line to a ParsedToken."
        term, node_type, third = lin.split("\t")
        is_eos = term in language.regexp_split_sentences
        # 'EOP\t3\t7' is the end-of-paragraph marker emitted by -E;
        # render it as a pilcrow.
        if term == 'EOP' and third == '7':
            term = '¶'
        #

        # %t char-type codes 2, 6, 7, 8 are word-like tokens.
        is_word = node_type in '2678'
        return ParsedToken(term, is_word, is_eos or term == '¶')

    tokens = [line_to_token(lin) for lin in lines]
    return tokens


Expand All @@ -86,17 +95,27 @@ def _string_is_hiragana(self, s: str) -> bool:
return all(self._char_is_hiragana(c) for c in s)


def get_reading(self, text: str):
    """
    Get the pronunciation (yomi, in katakana) for the given text.

    Returns None if the text is all hiragana, or the pronunciation
    doesn't add value (same as text).
    """
    # All-hiragana text is already its own reading.
    if self._string_is_hiragana(text):
        return None

    # -O yomi = output the reading only, one node per line.
    flags = r'-O yomi'
    readings = []
    with MeCab(flags) as nm:
        for n in nm.parse(text, as_nodes=True):
            readings.append(n.feature)

    # Drop empty/None features (e.g. BOS/EOS nodes).
    readings = [
        r.strip() for r in readings
        if r is not None and r.strip() != ''
    ]

    ret = ''.join(readings).strip()
    # No reading found, or the reading is identical to the input:
    # nothing useful to show.
    if ret in ('', text):
        return None
    return ret
9 changes: 8 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ def fixture_spanish(app_context, demo_yaml_folder):

@pytest.fixture(name="english")
def fixture_english(app_context, demo_yaml_folder):
    # Fixed copy-paste docstring: this builds the *English* language,
    # not "jp"/"spanish".
    "Make english from demo file."
    f = os.path.join(demo_yaml_folder, 'english.yaml')
    return _get_language(f)


@pytest.fixture(name="japanese")
def fixture_japanese(app_context, demo_yaml_folder):
    "Make jp from demo file."
    yaml_file = os.path.join(demo_yaml_folder, 'japanese.yaml')
    return _get_language(yaml_file)
15 changes: 15 additions & 0 deletions tests/features/rendering.feature
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,21 @@ Feature: Rendering
ışık(1)/ /için(3)/ /Işık(1)/ /İçin(3)/.



Scenario: Japanese text.
Given language Japanese
And text:
私は元気です.
Then rendered should be:
私/は/元気/です/.
Given terms:
元気
です
Then rendered should be:
私(1)/は/元気(1)/です(1)/.


Scenario: Japanese multiword at end of sentence.
Given language Japanese
And text:
Expand Down
71 changes: 71 additions & 0 deletions tests/unit/parse/test_JapaneseParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
JapaneseParser tests.
"""

from lute.parse.mecab_parser import JapaneseParser
from lute.parse.base import ParsedToken


def assert_tokens_equals(text, lang, expected):
    """
    Parsing a text using a language should give the expected parsed tokens.
    expected is given as array of:
    [ original_text, is_word, is_end_of_sentence ]
    """
    parser = JapaneseParser()
    actual_tokens = parser.get_parsed_tokens(text, lang)
    expected_tokens = [ParsedToken(*e) for e in expected]
    # Compare string forms for readable assertion failure output.
    assert [str(t) for t in actual_tokens] == [str(t) for t in expected_tokens]



def test_end_of_sentence_stored_in_parsed_tokens(japanese):
    "ParsedToken is marked as EOS=True at ends of sentences."
    # Both half-width (. ? !) and full-width (。 ? !) sentence enders,
    # with a newline separating the two halves.
    s = "元気.元気?元気!\n元気。元気?元気!"

    # Expected tuples: (token_text, is_word[, is_end_of_sentence]).
    # '¶' is the paragraph marker the parser emits for the newline;
    # it counts as end-of-sentence but not as a word.
    expected = [
        ( "元気", True ),
        ( ".", False, True ),
        ( "元気", True ),
        ( "?", False, True ),
        ( "元気", True ),
        ( "!", False, True ),
        ( "¶", False, True ),
        ( "元気", True ),
        ( "。", False, True ),
        ( "元気", True ),
        ( "?", False, True ),
        ( "元気", True ),
        ( "!", False, True ),
        ( "¶", False, True )
    ]
    assert_tokens_equals(s, japanese, expected)


def test_readings():
    """
    Parser returns readings if they add value.
    """
    parser = JapaneseParser()

    # Romaji, katakana-only, and hiragana-only strings get no reading:
    # a reading would add nothing.
    for text in ('NHK', 'ツヨイ', 'どちら'):
        assert parser.get_reading(text) is None, text

    zws = '\u200B'
    reading_cases = [
        ('強い', 'ツヨイ'),
        ('二人', 'ニニン'),  # ah well, not perfect :-)
        ('強いか', 'ツヨイカ'),
        # zero-width-space ignored
        (f"強い{zws}か", f"ツヨイ{zws}カ"),
    ]

    for text, expected in reading_cases:
        assert parser.get_reading(text) == expected, text

0 comments on commit d2418c1

Please sign in to comment.