Skip to content

Commit

Permalink
Implicit Review Loading (#102)
Browse files Browse the repository at this point in the history
* feat: Implemented preliminary pipeline

* LLM experiment on gpt-4o-mini

* removed placeholder

* Added Gemini support

* Fixed crash on hitting gemini rate limit

* Implicit review loading

* Delete test.py
  • Loading branch information
CalderJohnson authored Nov 13, 2024
1 parent c261acb commit d63e37c
Show file tree
Hide file tree
Showing 7 changed files with 207 additions and 14 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ output/toy.2016SB5/5.deu_Latn/
data/raw/google/
*.rar
output/toy.2016SB5/5.arb_Arab/
.env
21 changes: 21 additions & 0 deletions output/toy.2016SB5/agg.ad.pred.eval.mean.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
metric,5.pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn.rnd.0.0
P_1,0.0666666666666666
P_5,0.0133333333333333
P_10,0.0066666666666666
P_100,0.0006666666666666
recall_1,0.0666666666666666
recall_5,0.0666666666666666
recall_10,0.0666666666666666
recall_100,0.0666666666666666
ndcg_cut_1,0.0666666666666666
ndcg_cut_5,0.0666666666666666
ndcg_cut_10,0.0666666666666666
ndcg_cut_100,0.0666666666666666
map_cut_1,0.0666666666666666
map_cut_5,0.0666666666666666
map_cut_10,0.0666666666666666
map_cut_100,0.0666666666666666
success_1,0.0666666666666666
success_5,0.0666666666666666
success_10,0.0666666666666666
success_100,0.0666666666666666
7 changes: 6 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,9 @@ fasttext @ git+https://github.com/facebookresearch/fastText.git

pampy==0.3.0
returns==0.22.0
more-itertools==10.1.0
more-itertools==10.1.0

# needed for implicit dataset generation
openai
pytest
python-dotenv
11 changes: 9 additions & 2 deletions src/cmn/review.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def __init__(self,
lempos: Optional[str] = None,
parent = None,
lang='eng_Latn',
category: Optional[str] = None
category: Optional[str] = None,
implicit: List[bool] = []
):
self.id = id
self.sentences = sentences #list of sentences of list of tokens
Expand All @@ -58,6 +59,8 @@ def __init__(self,
self.lempos = lempos
self.lang = lang
self.category = category
if not implicit: implicit = [False] * len(self.aos)
self.implicit = implicit

self.parent = parent
self.augs: Augmentation = {} #distionary of translated and backtranslated augmentations of this review in object format, e.g.,
Expand Down Expand Up @@ -85,7 +88,11 @@ def to_dict(self, w_augs=False):
def get_aos(self) -> List[List[AspectOpinionSentiment]]:
r = []
if not self.aos: return r
for i, aos in enumerate(self.aos): r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
for i, aos in enumerate(self.aos):
if self.implicit[i]:
r.append([([None], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
else:
r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
return r

def get_txt(self): return '. '.join(' '.join(s) for s in self.sentences)
Expand Down
35 changes: 24 additions & 11 deletions src/cmn/semeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ class SemEvalReview(Review):
def __init__(self, id, sentences, time, author, aos): super().__init__(self, id, sentences, time, author, aos)

@staticmethod
def load(path):
if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path)
def load(path, explicit=True, implicit=False):
if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path, explicit, implicit)
return SemEvalReview._txtloader(input)

@staticmethod
Expand All @@ -32,11 +32,11 @@ def _txtloader(path):
return reviews

@staticmethod
def _xmlloader(path):
def _xmlloader(path, explicit, implicit):
reviews_list = []
xtree = et.parse(path).getroot()
if xtree.tag == 'Reviews': reviews = [SemEvalReview._parse(xsentence) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence) for xsentence in tqdm(xtree)]
if xtree.tag == 'Reviews': reviews = [SemEvalReview._parse(xsentence, explicit, implicit) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence, explicit, implicit) for xsentence in tqdm(xtree)]

return [r for r in reviews if r]

Expand All @@ -56,15 +56,17 @@ def _map_idx(aspect, text):
return [i for i in range(len(text_tokens), len(text_tokens) + len(aspect_tokens))]

@staticmethod
def _parse(xsentence):
def _parse(xsentence, explicit, implicit):
id = xsentence.attrib["id"]
aos = []; aos_cats = []
for element in xsentence:
if element.tag == 'text': sentence = element.text # we consider each sentence as a signle review
elif element.tag == 'Opinions':#semeval-15-16
#<Opinion target="place" category="RESTAURANT#GENERAL" polarity="positive" from="5" to="10"/>
for opinion in element:
if opinion.attrib["target"] == 'NULL': continue
# Load implicit, explicit, or both aspects
if not implicit and opinion.attrib["target"] == 'NULL': continue
if not explicit and opinion.attrib["target"] != 'NULL': continue
# we may have duplicates for the same aspect due to being in different category like in semeval 2016's <sentence id="1064477:4">
aspect = (opinion.attrib["target"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
# we need to map char index to token index in aspect
Expand All @@ -78,7 +80,9 @@ def _parse(xsentence):
elif element.tag == 'aspectTerms':#semeval-14
#<aspectTerm term="table" polarity="neutral" from="5" to="10"/>
for opinion in element:
if opinion.attrib["term"] == 'NULL': continue
# Load implicit, explicit, or both aspects
if not implicit and opinion.attrib["term"] == 'NULL': continue
if not explicit and opinion.attrib["term"] != 'NULL': continue
# we may have duplicates for the same aspect due to being in different category like in semeval 2016's <sentence id="1064477:4">
aspect = (opinion.attrib["term"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
# we need to map char index to token index in aspect
Expand All @@ -93,15 +97,24 @@ def _parse(xsentence):
#<aspectCategory category="food" polarity="neutral"/>
aos_cats.append(opinion.attrib["category"])

# Mark all aos with implicit aspects
implicit_arr = [False] * len(aos)
if implicit:
for i, (idxlist, o, s, aspect_token) in enumerate(aos):
if aspect_token == 'NULL': implicit_arr[i] = True

#sentence = nlp(sentence) # as it does some processing, it destroys the token idx for aspect term
tokens = sentence.split()
# to fix ",a b c," to "a b c"
# to fix '"sales" team' to 'sales team' => semeval-14-laptop-<sentence id="1316">
# todo: fix 'Food-awesome.' to 'food awesome' => semeval-14-restaurant-<sentence id="1817">
for i, (idxlist, o, s, aspect_token) in enumerate(aos):
for j, idx in enumerate(idxlist): tokens[idx] = aspect_token.split()[j].replace('"', '')
aos[i] = (idxlist, o, s)
for j, idx in enumerate(idxlist):
if not implicit_arr[i]:
tokens[idx] = aspect_token.split()[j].replace('"', '')
aos[i] = (idxlist, o, s)

return Review(id=id, sentences=[[str(t).lower() for t in tokens]], time=None, author=None,
aos=[aos], lempos=None,
parent=None, lang='eng_Latn', category=aos_cats) if aos else None
parent=None, lang='eng_Latn', category=aos_cats, implicit=implicit_arr) if aos else None

86 changes: 86 additions & 0 deletions tests/ev_implicit_reviews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Expected values for test cases of implicit/explicit review loading"""

# Expected first review, last review, and number of reviews with implicit aspects.
# "first" and "last" are compared against the first element of Review.to_dict().
SEMEVAL_IMPLICIT = {
    "first": {
        "id": "1004293:2",
        "text": "they never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.",
        "sentences": [
            [
                "they", "never", "brought", "us", "complimentary", "noodles,",
                "ignored", "repeated", "requests", "for", "sugar,", "and",
                "threw", "our", "dishes", "on", "the", "table.",
            ],
        ],
        # Implicit aspects surface as [None] in the aspect slot.
        "aos": [[([None], [], "-1")]],
        "lang": "eng_Latn",
        "orig": True,
    },
    "last": {
        "id": "1058221:7",
        "text": "the last time i walked by it looked pretty empty. hmmm.",
        "sentences": [
            [
                "the", "last", "time", "i", "walked", "by", "it", "looked",
                "pretty", "empty.", "hmmm.",
            ],
        ],
        "aos": [[([None], [], "-1")]],
        "lang": "eng_Latn",
        "orig": True,
    },
    "count": 10,
}

# Expected first review, last review, and number of reviews with explicit aspects.
# "first" and "last" are compared against the first element of Review.to_dict().
SEMEVAL_EXPLICIT = {
    "first": {
        "id": "1004293:0",
        "text": "judging from previous posts of test this used to be a good place but not any longer.",
        "sentences": [
            [
                "judging", "from", "previous", "posts", "of", "test", "this",
                "used", "to", "be", "a", "good", "place", "but", "not", "any",
                "longer.",
            ],
        ],
        "aos": [
            [
                (["posts", "of", "test"], [], "-1"),
                (["place"], [], "-1"),
            ],
        ],
        "lang": "eng_Latn",
        "orig": True,
    },
    "last": {
        "id": "1058221:4",
        "text": "i happen to have a policy that goes along with a little bit of self-respect, which includes not letting a waiter intimidate me, i.e. make me feel bad asking for trivialities like water, or the check.",
        "sentences": [
            [
                "i", "happen", "to", "have", "a", "policy", "that", "goes",
                "along", "with", "a", "little", "bit", "of", "self-respect,",
                "which", "includes", "not", "letting", "a", "waiter",
                "intimidate", "me,", "i.e.", "make", "me", "feel", "bad",
                "asking", "for", "trivialities", "like", "water,", "or", "the",
                "check.",
            ],
        ],
        "aos": [[(["waiter"], [], "-1")]],
        "lang": "eng_Latn",
        "orig": True,
    },
    "count": 18,
}

# Expected first review, last review, and number of reviews when both implicit
# and explicit aspects are loaded.
# "first" and "last" are compared against the first element of Review.to_dict().
SEMEVAL_BOTH = {
    "first": {
        "id": "1004293:0",
        "text": "judging from previous posts of test this used to be a good place but not any longer.",
        "sentences": [
            [
                "judging", "from", "previous", "posts", "of", "test", "this",
                "used", "to", "be", "a", "good", "place", "but", "not", "any",
                "longer.",
            ],
        ],
        "aos": [
            [
                (["posts", "of", "test"], [], "-1"),
                (["place"], [], "-1"),
            ],
        ],
        "lang": "eng_Latn",
        "orig": True,
    },
    "last": {
        "id": "1058221:7",
        "text": "the last time i walked by it looked pretty empty. hmmm.",
        "sentences": [
            [
                "the", "last", "time", "i", "walked", "by", "it", "looked",
                "pretty", "empty.", "hmmm.",
            ],
        ],
        # Implicit aspects surface as [None] in the aspect slot.
        "aos": [[([None], [], "-1")]],
        "lang": "eng_Latn",
        "orig": True,
    },
    "count": 26,
}

# Expected number of reviews when neither implicit nor explicit aspects are loaded.
SEMEVAL_NULL = {"count": 0}
60 changes: 60 additions & 0 deletions tests/test_implicit_reviews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Unit tests for implicit review loading"""
import sys
import os
import pytest
from ev_implicit_reviews import SEMEVAL_EXPLICIT, SEMEVAL_IMPLICIT, SEMEVAL_BOTH, SEMEVAL_NULL
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))

from cmn.review import Review
from cmn.semeval import SemEvalReview
from cmn.twitter import TwitterReview # Currently, no twitter reviews have implicit aspects

# Toy SemEval-2016 restaurant fixture used by every test in this module.
# Resolved relative to this test file rather than the current working directory,
# so the tests find the data no matter where pytest is launched from.
SEMEVAL_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__),
    "..", "data", "raw", "semeval", "toy.2016SB5",
    "ABSA16_Restaurants_Train_SB1_v2.xml",
))

@pytest.mark.parametrize("path, expected", [
    (SEMEVAL_PATH, SEMEVAL_IMPLICIT),
])
def test_implicit(path, expected):
    """Check the loader output when only implicit-aspect reviews are requested.

    Loads ``path`` with ``explicit=False, implicit=True`` and compares the
    first review, the last review, and the number of loaded reviews against
    the ``first``/``last``/``count`` entries of ``expected``.
    """
    loaded: list[Review] = SemEvalReview.load(path, explicit=False, implicit=True)
    # Collect all observed values up front, then assert on each of them.
    observed_first, observed_last, observed_count = (
        loaded[0].to_dict()[0],
        loaded[-1].to_dict()[0],
        len(loaded),
    )
    assert observed_first == expected["first"]
    assert observed_last == expected["last"]
    assert observed_count == expected["count"]

@pytest.mark.parametrize("path, expected", [
    (SEMEVAL_PATH, SEMEVAL_EXPLICIT),
])
def test_explicit(path, expected):
    """Check the loader output when only explicit-aspect reviews are requested.

    Loads ``path`` with ``explicit=True, implicit=False`` and compares the
    first review, the last review, and the number of loaded reviews against
    the ``first``/``last``/``count`` entries of ``expected``.
    """
    loaded: list[Review] = SemEvalReview.load(path, explicit=True, implicit=False)
    # Collect all observed values up front, then assert on each of them.
    observed_first, observed_last, observed_count = (
        loaded[0].to_dict()[0],
        loaded[-1].to_dict()[0],
        len(loaded),
    )
    assert observed_first == expected["first"]
    assert observed_last == expected["last"]
    assert observed_count == expected["count"]

@pytest.mark.parametrize("path, expected", [
    (SEMEVAL_PATH, SEMEVAL_BOTH),
])
def test_implicit_and_explicit(path, expected):
    """Check the loader output when implicit and explicit reviews are both requested.

    Loads ``path`` with ``explicit=True, implicit=True`` and compares the
    first review, the last review, and the number of loaded reviews against
    the ``first``/``last``/``count`` entries of ``expected``.
    """
    loaded: list[Review] = SemEvalReview.load(path, explicit=True, implicit=True)
    # Collect all observed values up front, then assert on each of them.
    observed_first, observed_last, observed_count = (
        loaded[0].to_dict()[0],
        loaded[-1].to_dict()[0],
        len(loaded),
    )
    assert observed_first == expected["first"]
    assert observed_last == expected["last"]
    assert observed_count == expected["count"]

@pytest.mark.parametrize("path, expected", [
    (SEMEVAL_PATH, SEMEVAL_NULL),
])
def test_null(path, expected):
    """Check the loader output when neither implicit nor explicit reviews are requested.

    Loads ``path`` with ``explicit=False, implicit=False`` and compares only
    the number of loaded reviews against ``expected["count"]``.
    """
    loaded: list[Review] = SemEvalReview.load(path, explicit=False, implicit=False)
    assert len(loaded) == expected["count"]

0 comments on commit d63e37c

Please sign in to comment.