Skip to content

Commit

Permalink
Add Classical Chinese parser and tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
jzohrab committed Oct 23, 2023
1 parent 19b4726 commit 9a526b6
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 2 deletions.
58 changes: 58 additions & 0 deletions lute/parse/character_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Parsing for single-character languages.
The parser uses some Language settings (e.g., word characters) to
perform the actual parsing.
Includes classes:
- ClassicalChineseParser
"""

import re
from typing import List
from lute.parse.base import ParsedToken, AbstractParser


class ClassicalChineseParser(AbstractParser):
    """
    A parser for single-character languages such as Classical Chinese.

    Every character of the normalized text becomes its own ParsedToken.
    The Language settings decide which characters count as words
    (word_characters) and which ones end a sentence
    (regexp_split_sentences).
    """

    @classmethod
    def name(cls):
        "Human-readable name of this parser."
        return "Classical Chinese"

    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
        """
        Returns ParsedToken array for given language.

        The text is normalized first: spaces and tabs are removed, the
        language's character substitutions ("from=to" pairs separated
        by "|") are applied, curly braces become square brackets, and
        newlines become the paragraph marker '¶'.  Each remaining
        character then yields one token, flagged as a word if it
        matches language.word_characters and as an end of sentence if
        it appears in language.regexp_split_sentences or is '¶'.
        """
        text = re.sub(r'[ \t]+', '', text)

        for replacement in language.character_substitutions.split("|"):
            fromto = replacement.strip().split("=")
            if len(fromto) < 2:
                continue
            rfrom = fromto[0].strip()
            rto = fromto[1].strip()
            # An empty "from" would make str.replace insert rto before
            # and after every character, so such entries are skipped.
            if rfrom:
                text = text.replace(rfrom, rto)

        text = text.replace("\r\n", "\n")
        text = text.replace('{', '[')
        text = text.replace('}', ']')
        text = text.replace("\n", '¶')
        text = text.strip()

        # Nothing to tokenize; return before building the pattern so an
        # empty text never touches the language's word characters.
        if not text:
            return []

        # Compile once instead of once per character.
        word_char_re = re.compile(f'[{language.word_characters}]')
        sentence_enders = language.regexp_split_sentences

        tokens = []
        for char in text:
            is_word_char = word_char_re.match(char) is not None
            is_end_of_sentence = char in sentence_enders or char == '¶'
            tokens.append(ParsedToken(char, is_word_char, is_end_of_sentence))

        return tokens
4 changes: 3 additions & 1 deletion lute/parse/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
from lute.parse.base import AbstractParser
from lute.parse.space_delimited_parser import SpaceDelimitedParser, TurkishParser
from lute.parse.mecab_parser import JapaneseParser
from lute.parse.character_parser import ClassicalChineseParser


parsers = {
'spacedel': SpaceDelimitedParser,
'turkish': TurkishParser,
'japanese': JapaneseParser
'japanese': JapaneseParser,
'classicalchinese': ClassicalChineseParser
}


Expand Down
12 changes: 11 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,13 @@ def _get_language(f):
def fixture_test_languages(app_context, demo_yaml_folder):
"Dict of available languages for tests."
# Hardcoded = good enough.
langs = [ 'spanish', 'english', 'japanese', 'turkish' ]
langs = [
'spanish',
'english',
'japanese',
'turkish',
'classical_chinese'
]
ret = {}
for lang in langs:
f = os.path.join(demo_yaml_folder, f'{lang}.yaml')
Expand All @@ -153,3 +159,7 @@ def fixture_japanese(test_languages):
@pytest.fixture(name="turkish")
def fixture_turkish(test_languages):
    "The 'turkish' entry of the test_languages dict."
    turkish_lang = test_languages['turkish']
    return turkish_lang

@pytest.fixture(name="classical_chinese")
def fixture_cl_chinese(test_languages):
    "The 'classical_chinese' entry of the test_languages dict."
    cl_chinese_lang = test_languages['classical_chinese']
    return cl_chinese_lang
12 changes: 12 additions & 0 deletions tests/features/rendering.feature
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,18 @@ Feature: Rendering
している(1)


Scenario: Bug fix: classical chinese first character
Given language Classical Chinese
And text:
關關。
Then rendered should be:
關/關/。
Given terms:
Then rendered should be:
關(1)/關(1)/。


# Template
# Scenario: x
# Given language x
Expand Down
75 changes: 75 additions & 0 deletions tests/unit/parse/test_ClassicalChineseParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
ClassicalChineseParser tests.
"""

from lute.parse.base import ParsedToken


def assert_tokens_equals(text, lang, expected):
    """
    Check that lang's parser turns text into the expected tokens.

    expected is a list of ParsedToken argument lists:
    [ original_text, is_word, is_end_of_sentence ]
    Actual and expected tokens are compared by their string form.
    """
    parser = lang.parser
    print("passing text:")
    print(text)
    parsed = parser.get_parsed_tokens(text, lang)
    wanted = [ParsedToken(*args) for args in expected]
    actual_strings = list(map(str, parsed))
    wanted_strings = list(map(str, wanted))
    assert actual_strings == wanted_strings



def test_sample_1(classical_chinese):
    "A single sentence is split into one token per character."
    s = "學而時習之,不亦說乎?"

    # Word characters are [char, True]; the comma is a non-word and
    # the question mark is a non-word that ends the sentence.
    expected = (
        [[c, True] for c in '學而時習之']
        + [[',', False]]
        + [[c, True] for c in '不亦說乎']
        + [["?", False, True]]
    )
    assert_tokens_equals(s, classical_chinese, expected)


def test_sample_2(classical_chinese):
    "Two lines with stray spaces: spaces vanish, the newline becomes ¶."
    s = """學 而時習 之,不亦說乎?
有朋 自遠方來,不亦樂乎?"""

    first_line = (
        [[c, True] for c in '學而時習之']
        + [[',', False]]
        + [[c, True] for c in '不亦說乎']
        + [["?", False, True]]
    )
    second_line = (
        [[c, True] for c in "有朋自遠方來"]
        + [[",", False]]
        + [[c, True] for c in "不亦樂乎"]
        + [["?", False, True]]
    )
    # The line break is expected as a sentence-ending paragraph marker.
    expected = first_line + [["¶", False, True]] + second_line
    assert_tokens_equals(s, classical_chinese, expected)

0 comments on commit 9a526b6

Please sign in to comment.