-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Classical Chinese parser and tests.
- Loading branch information
Showing
5 changed files
with
159 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
""" | ||
Parsing for single-character languages. | ||
The parser uses some Language settings (e.g., word characters) to | ||
perform the actual parsing. | ||
Includes classes: | ||
- ClassicalChineseParser | ||
""" | ||
|
||
import re | ||
from typing import List | ||
from lute.parse.base import ParsedToken, AbstractParser | ||
|
||
|
||
class ClassicalChineseParser(AbstractParser):
    """
    Parser that treats every character of the text as its own token.

    Spaces and tabs are dropped, the language's character substitutions
    are applied, newlines become the paragraph marker '¶', and each
    remaining character is emitted as a separate ParsedToken.
    """

    @classmethod
    def name(cls):
        """Return the display name of this parser."""
        return "Classical Chinese"

    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
        """
        Return one ParsedToken per character of the cleaned-up text.

        Settings read from ``language``:
        - character_substitutions: "from=to" pairs separated by "|".
        - word_characters: placed inside a regex character class to
          decide whether a character is a word.
        - regexp_split_sentences: a character contained in this value
          ends a sentence.
        """
        # Spaces and tabs are removed entirely before tokenizing.
        text = re.sub(r'[ \t]+', '', text)

        # Treat a missing setting the same as "no substitutions".
        substitutions = language.character_substitutions or ""
        for replacement in substitutions.split("|"):
            fromto = replacement.strip().split("=")
            if len(fromto) >= 2:
                rfrom = fromto[0].strip()
                rto = fromto[1].strip()
                # An empty source would make str.replace insert `rto`
                # between every character, so such entries are skipped.
                if rfrom:
                    text = text.replace(rfrom, rto)

        text = text.replace("\r\n", "\n")
        text = text.replace('{', '[')
        text = text.replace('}', ']')
        # Newlines are represented by the paragraph marker from here on.
        text = text.replace("\n", '¶')
        text = text.strip()

        # Compile once; the pattern does not change while iterating.
        word_char = re.compile(f'[{language.word_characters}]')
        # NOTE(review): this is a plain membership test against the
        # setting's value, not a regex match -- confirm that is intended.
        sentence_enders = language.regexp_split_sentences

        tokens = []
        for char in text:
            is_word_char = word_char.match(char) is not None
            # The paragraph marker always closes a sentence.
            is_end_of_sentence = char == '¶' or char in sentence_enders
            tokens.append(ParsedToken(char, is_word_char, is_end_of_sentence))

        return tokens
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
""" | ||
ClassicalChineseParser tests. | ||
""" | ||
|
||
from lute.parse.base import ParsedToken | ||
|
||
|
||
def assert_tokens_equals(text, lang, expected):
    """
    Check that parsing text with lang's parser yields the expected tokens.

    Each entry of expected lists the ParsedToken constructor arguments:
    [ original_text, is_word, is_end_of_sentence ]
    Tokens are compared through their string form.
    """
    parser = lang.parser
    print("passing text:")
    print(text)
    parsed = parser.get_parsed_tokens(text, lang)
    wanted = [ParsedToken(*fields) for fields in expected]
    parsed_strings = list(map(str, parsed))
    wanted_strings = list(map(str, wanted))
    assert parsed_strings == wanted_strings
|
||
|
||
|
||
def test_sample_1(classical_chinese):
    "A single sentence is split into one token per character."
    s = "學而時習之,不亦說乎?"

    def words(chars):
        "One [char, is_word=True] entry per character."
        return [[c, True] for c in chars]

    expected = (
        words('學而時習之')
        + [[',', False]]
        + words('不亦說乎')
        + [["?", False, True]]
    )
    assert_tokens_equals(s, classical_chinese, expected)
|
||
|
||
def test_sample_2(classical_chinese):
    "Two lines with embedded spaces: spaces vanish, newline becomes a ¶ token."
    s = """學 而時習 之,不亦說乎?
有朋 自遠方來,不亦樂乎?"""

    def words(chars):
        "One [char, is_word=True] entry per character."
        return [[c, True] for c in chars]

    first_line = (
        words('學而時習之')
        + [[',', False]]
        + words('不亦說乎')
        + [["?", False, True]]
    )
    paragraph_break = [["¶", False, True]]
    second_line = (
        words("有朋自遠方來")
        + [[",", False]]
        + words("不亦樂乎")
        + [["?", False, True]]
    )

    expected = first_line + paragraph_break + second_line
    assert_tokens_equals(s, classical_chinese, expected)