Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prioritize numbers next to currencies #5

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 64 additions & 21 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
from typing import Callable, Match, Optional, Pattern, List, Tuple
from decimal import Decimal, InvalidOperation

import attr
Expand Down Expand Up @@ -37,14 +37,20 @@ def fromstring(cls, price: Optional[str],
``price`` string, it could be **preferred** over a value extracted
from ``currency_hint`` string.
"""
amount_text = extract_price_text(price) if price is not None else None
currency_match, source = _extract_currency_symbol(price, currency_hint)
if price is not None:
_currency_match = currency_match if source == price else None
amount_text = extract_price_text(price, _currency_match)
else:
amount_text = None
amount_num = (
parse_number(amount_text, decimal_separator)
if amount_text is not None else None
)
currency = extract_currency_symbol(price, currency_hint)
if currency is not None:
currency = currency.strip()
if currency_match is not None:
currency = currency_match.group(0).strip()
else:
currency = None
return Price(
amount=amount_num,
currency=currency,
Expand Down Expand Up @@ -129,11 +135,11 @@ def or_regex(symbols: List[str]) -> Pattern:
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search


def extract_currency_symbol(price: Optional[str],
currency_hint: Optional[str]) -> Optional[str]:
def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]:
"""
Guess currency symbol from extracted price and currency strings.
Return an empty string if symbol is not found.
Guess the currency symbol from extracted price and currency strings.
Return a (`match object`_, source_string) tuple with the symbol found and
the string where it was found, or (None, None) if no symbol is found.
"""
methods: List[Tuple[Callable, Optional[str]]] = [
(_search_safe_currency, price),
Expand All @@ -151,17 +157,32 @@ def extract_currency_symbol(price: Optional[str],
for meth, attr in methods:
m = meth(attr) if attr else None
if m:
return m.group(0)
return m, attr

return None, None


def extract_currency_symbol(price: Optional[str],
                            currency_hint: Optional[str]) -> Optional[str]:
    """
    Guess the currency symbol from the extracted price and currency strings.

    Thin wrapper over ``_extract_currency_symbol``: the string in which the
    symbol was found is discarded and only the matched text is exposed.

    Return the symbol exactly as it was matched, or None if no symbol is
    found.
    """
    found, _source = _extract_currency_symbol(price, currency_hint)
    return found.group(0) if found else None


def extract_price_text(price: str) -> Optional[str]:
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
"""
Extract text of a price from a string which contains price and
maybe some other text. If multiple price-looking substrings are present,
the first is returned (FIXME: it is better to return a number
which is near a currency symbol).
maybe some other text.

If a match object of the currency within the `price` string is provided,
amounts before or after the matched currency substring are prioritized.
Otherwise, if multiple price-looking substrings are present, the first is
returned.

>>> extract_price_text("price: $12.99")
'12.99'
Expand Down Expand Up @@ -205,21 +226,43 @@ def extract_price_text(price: str) -> Optional[str]:
if m:
return m.group(0).replace(' ', '')

m = re.search(r"""
([.]?\d[\d\s.,]*) # number, probably with thousand separators
\s*? # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)

if m:
def number_from_match(m):
price_text = m.group(1).rstrip(',.')
return (
price_text.strip()
if price_text.count('.') == 1
else price_text.lstrip(',.').strip()
)

if currency_match is not None:

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s*$ # only match right before the currency symbol
""", price[:currency_match.start(0)], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
^\s* # only match right after the currency symbol
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price[currency_match.end(0):], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
([.]?\d[\d\s.,]*) # number, probably with thousand separators
\s*? # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)
if m:
return number_from_match(m)

if 'free' in price.lower():
return '0'

return None


Expand Down
83 changes: 77 additions & 6 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pytest

from price_parser import Price
from price_parser.parser import extract_currency_symbol


class Example(Price):
Expand Down Expand Up @@ -647,8 +648,6 @@ def __eq__(self, other):
'Р', '30', 30),
Example('€', '€ 139.00',
'€', '139.00', 139),
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),
Example('Pris NOK 1 999,00', '139,00',
'NOK', '139,00', 139),
Example('/sqft', '1.52',
Expand Down Expand Up @@ -1935,15 +1934,55 @@ def __eq__(self, other):
'CHF', '19.90', 19.90),
Example('', '530,42 Zł',
'Zł', '530,42', 530.42),

# Prefer values next to currency symbols
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, '2 items at 24,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00€ or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at 24,00 € or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at €24,00 or € 30,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or 30,00€',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or 30,00 €',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or €30,00',
'€', '24,00', 24.00),
Example(None, '2 items at € 24,00 or € 30,00',
'€', '24,00', 24.00),
]


PRICE_PARSING_EXAMPLES_XFAIL = [
# amount is picked as a price
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',
None, None, None),
Example(None, '50 - $2.00 100 - $2.75 400 - $4.50 1,000 - $9.00 2,000 - $17.00 3,000 - $24.00 10,000 - $75.00',
Expand All @@ -1957,6 +1996,14 @@ def __eq__(self, other):
Example('Cuneo', '61.858 L', # Romanian New Leu
'L', '61.858', 61858),

# no handling of price ranges
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),

# no handling of old-vs-new prices
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),

# "р" / "руб" is detected as currency
Example('>', 'См. цену в прайсе',
None, None, None),
Expand Down Expand Up @@ -2026,6 +2073,30 @@ def test_parsing(example: Example):
assert parsed == example, f"Failed scenario: price={example.price_raw}, currency_hint={example.currency_raw}"


@pytest.mark.parametrize(
    "input_string,symbol",
    [
        # no currency
        ('', None),
        ('1', None),

        # fictional currency
        ('10 eddies', None),

        # currency code
        ('5 CNY', 'CNY'),

        # currency name
        ('5 euros', 'euro'),

        # currency symbol
        ('$4', '$'),
    ]
)
def test_extract_currency_symbol(input_string, symbol):
    """Check the symbol extracted from ``input_string`` with no currency hint."""
    detected = extract_currency_symbol(input_string, None)
    assert detected == symbol


@pytest.mark.parametrize(
"amount,amount_float",
(
Expand Down