Implicit Review Loading (#102)

* feat: Implemented preliminary pipeline
* LLM experiment on gpt-4o-mini
* Removed placeholder
* Added Gemini support
* Fixed crash on hitting Gemini rate limit
* Implicit review loading
* Delete test.py
fani-lab · Nov 13, 2024 · d63e37c · d63e37c
1 parent c261acb
commit d63e37c
Show file tree

Hide file tree

Showing 7 changed files with 207 additions and 14 deletions.
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,4 @@ output/toy.2016SB5/5.deu_Latn/
 data/raw/google/
 *.rar
 output/toy.2016SB5/5.arb_Arab/
+.env
diff --git a/output/toy.2016SB5/agg.ad.pred.eval.mean.csv b/output/toy.2016SB5/agg.ad.pred.eval.mean.csv
@@ -0,0 +1,21 @@
+metric,5.pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn.rnd.0.0
+P_1,0.0666666666666666
+P_5,0.0133333333333333
+P_10,0.0066666666666666
+P_100,0.0006666666666666
+recall_1,0.0666666666666666
+recall_5,0.0666666666666666
+recall_10,0.0666666666666666
+recall_100,0.0666666666666666
+ndcg_cut_1,0.0666666666666666
+ndcg_cut_5,0.0666666666666666
+ndcg_cut_10,0.0666666666666666
+ndcg_cut_100,0.0666666666666666
+map_cut_1,0.0666666666666666
+map_cut_5,0.0666666666666666
+map_cut_10,0.0666666666666666
+map_cut_100,0.0666666666666666
+success_1,0.0666666666666666
+success_5,0.0666666666666666
+success_10,0.0666666666666666
+success_100,0.0666666666666666
diff --git a/requirements.txt b/requirements.txt
@@ -37,4 +37,9 @@ fasttext @ git+https://github.com/facebookresearch/fastText.git
 
 pampy==0.3.0
 returns==0.22.0
-more-itertools==10.1.0
+more-itertools==10.1.0
+
+# needed for implicit dataset generation
+openai
+pytest
+python-dotenv
diff --git a/src/cmn/review.py b/src/cmn/review.py
@@ -48,7 +48,8 @@ def __init__(self,
                  lempos: Optional[str] = None,
                  parent = None,
                  lang='eng_Latn',
-                 category: Optional[str] = None
+                 category: Optional[str] = None,
+                 implicit: List[bool] = []
     ):
         self.id = id
         self.sentences = sentences #list of sentences of list of tokens
@@ -58,6 +59,8 @@ def __init__(self,
         self.lempos = lempos
         self.lang = lang
         self.category = category
+        if not implicit: implicit = [False] * len(self.aos)
+        self.implicit = implicit
 
         self.parent = parent
         self.augs: Augmentation = {} #dictionary of translated and backtranslated augmentations of this review in object format, e.g.,
@@ -85,7 +88,11 @@ def to_dict(self, w_augs=False):
     def get_aos(self) -> List[List[AspectOpinionSentiment]]:
         r = []
         if not self.aos: return r
-        for i, aos in enumerate(self.aos): r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
+        for i, aos in enumerate(self.aos):
+            if self.implicit[i]:
+                r.append([([None], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
+            else:
+                r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
         return r
 
     def get_txt(self): return '. '.join(' '.join(s) for s in self.sentences)

diff --git a/src/cmn/semeval.py b/src/cmn/semeval.py
@@ -11,8 +11,8 @@ class SemEvalReview(Review):
     def __init__(self, id, sentences, time, author, aos): super().__init__(self, id, sentences, time, author, aos)
 
     @staticmethod
-    def load(path):
-        if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path)
+    def load(path, explicit=True, implicit=False):
+        if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path, explicit, implicit)
         return SemEvalReview._txtloader(input)
 
     @staticmethod
@@ -32,11 +32,11 @@ def _txtloader(path):
         return reviews
 
     @staticmethod
-    def _xmlloader(path):
+    def _xmlloader(path, explicit, implicit):
         reviews_list = []
         xtree = et.parse(path).getroot()
-        if xtree.tag == 'Reviews':   reviews = [SemEvalReview._parse(xsentence) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
-        if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence) for xsentence in tqdm(xtree)]
+        if xtree.tag == 'Reviews':   reviews = [SemEvalReview._parse(xsentence, explicit, implicit) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
+        if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence, explicit, implicit) for xsentence in tqdm(xtree)]
 
         return [r for r in reviews if r]
 
@@ -56,15 +56,17 @@ def _map_idx(aspect, text):
         return [i for i in range(len(text_tokens), len(text_tokens) + len(aspect_tokens))]
 
     @staticmethod
-    def _parse(xsentence):
+    def _parse(xsentence, explicit, implicit):
         id = xsentence.attrib["id"]
         aos = []; aos_cats = []
         for element in xsentence:
             if element.tag == 'text': sentence = element.text # we consider each sentence as a single review
             elif element.tag == 'Opinions':#semeval-15-16
                 #<Opinion target="place" category="RESTAURANT#GENERAL" polarity="positive" from="5" to="10"/>
                 for opinion in element:
-                    if opinion.attrib["target"] == 'NULL': continue
+                    # Load implicit, explicit, or both aspects
+                    if not implicit and opinion.attrib["target"] == 'NULL': continue
+                    if not explicit and opinion.attrib["target"] != 'NULL': continue
                     # we may have duplicates for the same aspect due to being in different category like in semeval 2016's <sentence id="1064477:4">
                     aspect = (opinion.attrib["target"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
                     # we need to map char index to token index in aspect
@@ -78,7 +80,9 @@ def _parse(xsentence):
             elif element.tag == 'aspectTerms':#semeval-14
                 #<aspectTerm term="table" polarity="neutral" from="5" to="10"/>
                 for opinion in element:
-                    if opinion.attrib["term"] == 'NULL': continue
+                    # Load implicit, explicit, or both aspects
+                    if not implicit and opinion.attrib["term"] == 'NULL': continue
+                    if not explicit and opinion.attrib["term"] != 'NULL': continue
                     # we may have duplicates for the same aspect due to being in different category like in semeval 2016's <sentence id="1064477:4">
                     aspect = (opinion.attrib["term"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
                     # we need to map char index to token index in aspect
@@ -93,15 +97,24 @@ def _parse(xsentence):
                     #<aspectCategory category="food" polarity="neutral"/>
                     aos_cats.append(opinion.attrib["category"])
 
+        # Mark all aos with implicit aspects
+        implicit_arr = [False] * len(aos)
+        if implicit:
+            for i, (idxlist, o, s, aspect_token) in enumerate(aos):
+                if aspect_token == 'NULL': implicit_arr[i] = True
+
         #sentence = nlp(sentence) # as it does some processing, it destroys the token idx for aspect term
         tokens = sentence.split()
         # to fix ",a b c," to "a b c"
         # to fix '"sales" team' to 'sales team' => semeval-14-laptop-<sentence id="1316">
         # todo: fix 'Food-awesome.' to 'food awesome' => semeval-14-restaurant-<sentence id="1817">
         for i, (idxlist, o, s, aspect_token) in enumerate(aos):
-            for j, idx in enumerate(idxlist): tokens[idx] = aspect_token.split()[j].replace('"', '')
-            aos[i] = (idxlist, o, s)
+            for j, idx in enumerate(idxlist):
+                if not implicit_arr[i]:
+                    tokens[idx] = aspect_token.split()[j].replace('"', '')
+                aos[i] = (idxlist, o, s)
+
         return Review(id=id, sentences=[[str(t).lower() for t in tokens]], time=None, author=None,
                       aos=[aos], lempos=None,
-                      parent=None, lang='eng_Latn', category=aos_cats) if aos else None
+                      parent=None, lang='eng_Latn', category=aos_cats, implicit=implicit_arr) if aos else None
 
diff --git a/tests/ev_implicit_reviews.py b/tests/ev_implicit_reviews.py
@@ -0,0 +1,86 @@
+"""Expected values for test cases of implicit/explicit review loading"""
+
+# Expected first, last, and amount of reviews with implicit aspects
+SEMEVAL_IMPLICIT = {
+    "first": {
+        'id': '1004293:2',
+        'text': 'they never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.',
+        'sentences': [
+            ['they', 'never', 'brought', 'us', 'complimentary', 'noodles,', 'ignored', 'repeated', 'requests', 'for', 'sugar,', 'and', 'threw', 'our', 'dishes', 'on', 'the', 'table.']
+        ],
+        'aos': [[([None], [], '-1')]],
+        'lang': 'eng_Latn',
+        'orig': True
+    },
+    "last": {
+        'id': '1058221:7',
+        'text': 'the last time i walked by it looked pretty empty. hmmm.', 
+        'sentences': [
+            ['the', 'last', 'time', 'i', 'walked', 'by', 'it', 'looked', 'pretty', 'empty.', 'hmmm.']
+        ], 
+        'aos': [[([None], [], '-1')]], 
+        'lang': 'eng_Latn', 
+        'orig': True
+    },
+    "count": 10,
+}
+
+# Expected first, last, and amount of reviews with explicit aspects
+SEMEVAL_EXPLICIT = {
+    "first": {
+        'id': '1004293:0',
+        'text': 'judging from previous posts of test this used to be a good place but not any longer.',
+        'sentences': [
+            ['judging', 'from', 'previous', 'posts', 'of', 'test', 'this', 'used', 'to', 'be', 'a', 'good', 'place', 'but', 'not', 'any', 'longer.']
+        ],
+        'aos': [
+            [(['posts', 'of', 'test'], [], '-1'),
+             (['place'], [], '-1')]
+        ],
+        'lang': 'eng_Latn',
+        'orig': True
+    },
+    "last": {
+        'id': '1058221:4',
+        'text': 'i happen to have a policy that goes along with a little bit of self-respect, which includes not letting a waiter intimidate me, i.e. make me feel bad asking for trivialities like water, or the check.',
+        'sentences': [
+            ['i', 'happen', 'to', 'have', 'a', 'policy', 'that', 'goes', 'along', 'with', 'a', 'little', 'bit', 'of', 'self-respect,', 'which', 'includes', 'not', 'letting', 'a', 'waiter', 'intimidate', 'me,', 'i.e.', 'make', 'me', 'feel', 'bad', 'asking', 'for', 'trivialities', 'like', 'water,', 'or', 'the', 'check.']
+        ],
+        'aos': [[(['waiter'], [], '-1')]],
+        'lang': 'eng_Latn',
+        'orig': True
+    },
+    "count": 18,
+}
+
+# Expected first, last, and amount of reviews with implicit and explicit aspects
+SEMEVAL_BOTH = {
+    "first": {
+        'id': '1004293:0',
+        'text': 'judging from previous posts of test this used to be a good place but not any longer.',
+        'sentences': [
+            ['judging', 'from', 'previous', 'posts', 'of', 'test', 'this', 'used', 'to', 'be', 'a', 'good', 'place', 'but', 'not', 'any', 'longer.']
+        ],
+        'aos': [
+            [(['posts', 'of', 'test'], [], '-1'),
+             (['place'], [], '-1')]
+        ],
+        'lang': 'eng_Latn',
+        'orig': True
+    },
+    "last": {
+        'id': '1058221:7',
+        'text': 'the last time i walked by it looked pretty empty. hmmm.', 
+        'sentences': [
+            ['the', 'last', 'time', 'i', 'walked', 'by', 'it', 'looked', 'pretty', 'empty.', 'hmmm.']
+        ], 
+        'aos': [[([None], [], '-1')]], 
+        'lang': 'eng_Latn', 
+        'orig': True
+    },
+    "count": 26,
+}
+
+SEMEVAL_NULL = {
+    "count": 0,
+}
diff --git a/tests/test_implicit_reviews.py b/tests/test_implicit_reviews.py
@@ -0,0 +1,60 @@
+"""Unit tests for implicit review loading"""
+import sys
+import os
+import pytest
+from ev_implicit_reviews import SEMEVAL_EXPLICIT, SEMEVAL_IMPLICIT, SEMEVAL_BOTH, SEMEVAL_NULL
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
+
+from cmn.review import Review
+from cmn.semeval import SemEvalReview
+from cmn.twitter import TwitterReview # Currently, no twitter reviews have implicit aspects
+
+SEMEVAL_PATH = "./data/raw/semeval/toy.2016SB5/ABSA16_Restaurants_Train_SB1_v2.xml"
+
+@pytest.mark.parametrize("path, expected", [
+    (SEMEVAL_PATH, SEMEVAL_IMPLICIT),
+])
+def test_implicit(path, expected):
+    """Test loading implicit aspect containing reviews."""
+    reviews: list[Review] = SemEvalReview.load(path, explicit=False, implicit=True)
+    first = reviews[0].to_dict()[0]
+    last = reviews[-1].to_dict()[0]
+    count = len(reviews)
+    assert first == expected["first"]
+    assert last == expected["last"]
+    assert count == expected["count"]
+
+@pytest.mark.parametrize("path, expected", [
+    (SEMEVAL_PATH, SEMEVAL_EXPLICIT),
+])
+def test_explicit(path, expected):
+    """Test loading explicit aspect containing reviews."""
+    reviews: list[Review] = SemEvalReview.load(path, explicit=True, implicit=False)
+    first = reviews[0].to_dict()[0]
+    last = reviews[-1].to_dict()[0]
+    count = len(reviews)
+    assert first == expected["first"]
+    assert last == expected["last"]
+    assert count == expected["count"]
+
+@pytest.mark.parametrize("path, expected", [
+    (SEMEVAL_PATH, SEMEVAL_BOTH),
+])
+def test_implicit_and_explicit(path, expected):
+    """Test loading both implicit and explicit reviews."""
+    reviews: list[Review] = SemEvalReview.load(path, explicit=True, implicit=True)
+    first = reviews[0].to_dict()[0]
+    last = reviews[-1].to_dict()[0]
+    count = len(reviews)
+    assert first == expected["first"]
+    assert last == expected["last"]
+    assert count == expected["count"]
+
+@pytest.mark.parametrize("path, expected", [
+    (SEMEVAL_PATH, SEMEVAL_NULL),
+])
+def test_null(path, expected):
+    """Test loading neither implicit nor explicit reviews."""
+    reviews: list[Review] = SemEvalReview.load(path, explicit=False, implicit=False)
+    count = len(reviews)
+    assert count == expected["count"]
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,3 +43,4 @@ output/toy.2016SB5/5.deu_Latn/ @@
     data/raw/google/
     *.rar
     output/toy.2016SB5/5.arb_Arab/
+    .env