Skip to content

Commit

Permalink
Prioritize numbers next to currencies
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Jun 19, 2019
1 parent 4d9c393 commit bd856ed
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 22 deletions.
76 changes: 60 additions & 16 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
from typing import Callable, Match, Optional, Pattern, List, Tuple
from decimal import Decimal, InvalidOperation

import attr
Expand Down Expand Up @@ -36,11 +36,17 @@ def fromstring(cls, price: Optional[str],
``price`` string, it could be **preferred** over a value extracted
from ``currency_hint`` string.
"""
amount_text = extract_price_text(price) if price is not None else None
currency_match, source = _extract_currency_symbol(price, currency_hint)
if price is not None:
_currency_match = currency_match if source == price else None
amount_text = extract_price_text(price, _currency_match)
else:
amount_text = None
amount_num = parse_number(amount_text) if amount_text is not None else None
currency = extract_currency_symbol(price, currency_hint)
if currency is not None:
currency = currency.strip()
if currency_match is not None:
currency = currency_match.group(0).strip()
else:
currency = None
return Price(
amount=amount_num,
currency=currency,
Expand Down Expand Up @@ -120,11 +126,11 @@ def or_regex(symbols: List[str]) -> Pattern:
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search


def extract_currency_symbol(price: Optional[str],
currency_hint: Optional[str]) -> Optional[str]:
def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]:
"""
Guess currency symbol from extracted price and currency strings.
Return an empty string if symbol is not found.
Guess the currency symbol from extracted price and currency strings.
Return a (`match object`_, source_string) tuple with the symbol found and
the string where it was found, or (None, None) if no symbol is found.
"""
methods: List[Tuple[Callable, Optional[str]]] = [
(_search_safe_currency, price),
Expand All @@ -142,17 +148,32 @@ def extract_currency_symbol(price: Optional[str],
for meth, attr in methods:
m = meth(attr) if attr else None
if m:
return m.group(0)
return m, attr

return None, None


def extract_currency_symbol(price: Optional[str],
                            currency_hint: Optional[str]) -> Optional[str]:
    """
    Guess the currency symbol from the price and currency hint strings.

    The search itself is delegated to ``_extract_currency_symbol``; the
    source string it reports alongside the match is discarded here and
    only the matched text is kept.

    Return the symbol exactly as it was matched, or None if no symbol is
    found.
    """
    symbol_match, _source = _extract_currency_symbol(price, currency_hint)
    # A regex match object is always truthy, so testing against None is
    # equivalent to a plain truthiness check.
    return None if symbol_match is None else symbol_match.group(0)


def extract_price_text(price: str) -> Optional[str]:
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
"""
Extract text of a price from a string which contains price and
maybe some other text. If multiple price-looking substrings are present,
the first is returned (FIXME: it is better to return a number
which is near a currency symbol).
maybe some other text.
If a match object of the currency within the `price` string is provided,
amounts before or after the matched currency substring are prioritized.
Otherwise, if multiple price-looking substrings are present, the first is
returned.
>>> extract_price_text("price: $12.99")
'12.99'
Expand Down Expand Up @@ -189,16 +210,39 @@ def extract_price_text(price: str) -> Optional[str]:
""", price, re.VERBOSE)
if m:
return m.group(0).replace(' ', '')

def number_from_match(m):
return m.group(1).strip(',.').strip()

if currency_match is not None:

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s*$ # only match right before the currency symbol
""", price[:currency_match.start(0)], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
^\s* # only match right after the currency symbol
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price[currency_match.end(0):], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)

if m:
return m.group(1).strip(',.').strip()
return number_from_match(m)

if 'free' in price.lower():
return '0'

return None


Expand Down
58 changes: 52 additions & 6 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,8 +617,6 @@ def __eq__(self, other):
'Р', '30', 30),
Example('€', '€ 139.00',
'€', '139.00', 139),
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),
Example('Pris NOK 1 999,00', '139,00',
'NOK', '139,00', 139),
Example('/sqft', '1.52',
Expand Down Expand Up @@ -1901,15 +1899,55 @@ def __eq__(self, other):
'CHF', '19.90', 19.90),
Example('', '530,42 Zł',
'Zł', '530,42', 530.42),

# Prefer values next to currency symbols
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, '2 items at 24,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or € 30,00',
'€', '24,00', 24.00),
]


PRICE_PARSING_EXAMPLES_XFAIL = [
# amount is picked as a price
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',
None, None, None),
Example(None, '50 - $2.00 100 - $2.75 400 - $4.50 1,000 - $9.00 2,000 - $17.00 3,000 - $24.00 10,000 - $75.00',
Expand All @@ -1923,6 +1961,14 @@ def __eq__(self, other):
Example('Cuneo', '61.858 L', # Romanian New Leu
'L', '61.858', 61858),

# no handling of price ranges
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),

# no handling of old-vs-new prices
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),

# "р" / "руб" is detected as currency
Example('>', 'См. цену в прайсе',
None, None, None),
Expand Down

0 comments on commit bd856ed

Please sign in to comment.