-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.py
83 lines (72 loc) · 3.24 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
from consts import calculate_priority
class Token:
    """A single lexical unit: a kind tag together with the text it carries."""

    def __init__(self, type, value):
        """Store the token kind (``type``) and its associated text (``value``)."""
        self.type, self.value = type, value

    def __repr__(self):
        """Return a debug form in which only the value is shown via ``repr``."""
        return f"Token({self.type}, {repr(self.value)})"
class Tokenizer:
    """Split a boolean search query into operand terms and an operator string.

    The query text is upper-cased on construction, so ``and``/``or``/``not``
    written in any case are recognised as operators. Operand text is
    lower-cased again when :meth:`divide` reports it.
    """

    # Token kinds that act as boolean operators.
    _OPERATORS = ('AND', 'OR', 'NOT')
    # Token kinds that carry searchable text.
    _OPERANDS = ('TERM', 'PHRASE')

    def __init__(self, text):
        """Prepare the tokenizer for ``text`` and compile the master pattern.

        Args:
            text: The raw query string; it is stored upper-cased.
        """
        self.text = text.upper()
        self.position = 0  # not read anywhere in this class
        self.tokens = []
        # Order matters: the alternatives are tried left to right at each
        # position, so earlier entries win over later ones.
        self.token_specification = [
            ('PHRASE', r'"[^"]+"'),     # Phrase enclosed in double quotes
            ('AND', r'\bAND\b'),        # AND operator
            ('OR', r'\bOR\b'),          # OR operator
            ('NOT', r'\bNOT\b'),        # NOT operator
            ('TERM', r'[^\s()]+'),      # Term (any non-whitespace, non-parenthesis sequence)
            ('SKIP', r'\s+'),           # Skip over spaces and tabs
            # TERM above already accepts '*', so this alternative is never
            # the one that matches; a trailing '*' is handled in divide().
            ('PREFIX', r'\*'),          # Prefix operator
            ('MISMATCH', r'.'),         # Any other character
        ]
        self.token_regex = re.compile(
            '|'.join(f'(?P<{pair[0]}>{pair[1]})' for pair in self.token_specification)
        )

    def tokenize(self):
        """Scan the query, fill ``self.tokens`` and return :meth:`divide`'s result.

        A quoted phrase is expanded into one ``PHRASE`` token per word with an
        ``AND`` token between consecutive words. Two bare terms in a row get an
        implicit ``OR`` token inserted between them.

        Returns:
            The ``(terms, operator_string)`` tuple produced by :meth:`divide`.

        Raises:
            RuntimeError: If a character matches no pattern except the
                catch-all (parentheses outside a phrase are such characters),
                or if two operator tokens are adjacent.
        """
        # Start from a clean list so calling tokenize() again on the same
        # instance does not append a second copy of every token.
        self.tokens = []
        previous_token = None
        for match in self.token_regex.finditer(self.text):
            kind = match.lastgroup
            value = match.group(kind)
            if kind == 'SKIP':
                continue
            if kind == 'MISMATCH':
                raise RuntimeError(f'Unexpected character: {value}')

            current_token = Token(kind, value)
            if (
                previous_token is not None
                and previous_token.type in self._OPERATORS
                and current_token.type in self._OPERATORS
            ):
                raise RuntimeError(
                    f'Missing term between operators: {previous_token.value} and {current_token.value}'
                )

            if current_token.type == 'PHRASE':
                phrase_terms = value.strip('"').split()
                for i, term in enumerate(phrase_terms):
                    if i:
                        # Words of one phrase are joined with AND.
                        self.tokens.append(Token('AND', 'AND'))
                    self.tokens.append(Token('PHRASE', term))
            else:
                # NOTE(review): the implicit OR is only added between two
                # bare TERMs; a TERM next to a PHRASE gets no connecting
                # operator. Confirm that is what the consumer expects.
                if (
                    previous_token is not None
                    and previous_token.type == 'TERM'
                    and current_token.type == 'TERM'
                ):
                    self.tokens.append(Token('OR', 'OR'))
                self.tokens.append(current_token)
            # For a phrase this is the original whole-phrase token, not the
            # per-word tokens that were appended.
            previous_token = current_token
        return self.divide()

    def divide(self):
        """Separate ``self.tokens`` into operand terms and an operator string.

        Returns:
            A tuple ``(terms, string)``. ``terms`` is a list of
            ``(text, tag)`` pairs in token order, where ``text`` is the
            lower-cased operand and ``tag`` is ``'PREFIX'`` when the operand
            ended with ``*`` (the ``*`` is removed), ``'PHRASE'`` for a phrase
            word, or an empty list for a plain term. ``string`` is the
            concatenation of ``str(calculate_priority(token.type))`` for every
            non-operand token, in order.
        """
        terms = []
        priorities = []
        for token in self.tokens:
            if token.type in self._OPERANDS:
                term = token.value.lower()
                if term.endswith('*'):
                    # The prefix marker wins even for phrase words.
                    terms.append((term[:-1], 'PREFIX'))
                elif token.type == 'PHRASE':
                    terms.append((term, 'PHRASE'))
                else:
                    terms.append((term, []))
            else:
                priorities.append(str(calculate_priority(token.type)))
        return terms, ''.join(priorities)