diff --git a/.gitignore b/.gitignore
index 7851964..9033685 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,4 @@ output/toy.2016SB5/5.deu_Latn/
data/raw/google/
*.rar
output/toy.2016SB5/5.arb_Arab/
+.env
diff --git a/output/toy.2016SB5/agg.ad.pred.eval.mean.csv b/output/toy.2016SB5/agg.ad.pred.eval.mean.csv
new file mode 100644
index 0000000..8de77db
--- /dev/null
+++ b/output/toy.2016SB5/agg.ad.pred.eval.mean.csv
@@ -0,0 +1,21 @@
+metric,5.pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn.rnd.0.0
+P_1,0.0666666666666666
+P_5,0.0133333333333333
+P_10,0.0066666666666666
+P_100,0.0006666666666666
+recall_1,0.0666666666666666
+recall_5,0.0666666666666666
+recall_10,0.0666666666666666
+recall_100,0.0666666666666666
+ndcg_cut_1,0.0666666666666666
+ndcg_cut_5,0.0666666666666666
+ndcg_cut_10,0.0666666666666666
+ndcg_cut_100,0.0666666666666666
+map_cut_1,0.0666666666666666
+map_cut_5,0.0666666666666666
+map_cut_10,0.0666666666666666
+map_cut_100,0.0666666666666666
+success_1,0.0666666666666666
+success_5,0.0666666666666666
+success_10,0.0666666666666666
+success_100,0.0666666666666666
diff --git a/requirements.txt b/requirements.txt
index e095814..d240411 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,4 +37,9 @@ fasttext @ git+https://github.com/facebookresearch/fastText.git
pampy==0.3.0
returns==0.22.0
-more-itertools==10.1.0
\ No newline at end of file
+more-itertools==10.1.0
+
+# needed for implicit dataset generation (openai, python-dotenv) and its unit tests (pytest)
+openai
+pytest
+python-dotenv
\ No newline at end of file
diff --git a/src/cmn/review.py b/src/cmn/review.py
index b895f84..a0b9ab2 100644
--- a/src/cmn/review.py
+++ b/src/cmn/review.py
@@ -48,7 +48,8 @@ def __init__(self,
lempos: Optional[str] = None,
parent = None,
lang='eng_Latn',
- category: Optional[str] = None
+ category: Optional[str] = None,
+ implicit: List[bool] = []
):
self.id = id
self.sentences = sentences #list of sentences of list of tokens
@@ -58,6 +59,8 @@ def __init__(self,
self.lempos = lempos
self.lang = lang
self.category = category
+ if not implicit: implicit = [False] * len(self.aos)
+ self.implicit = implicit
self.parent = parent
self.augs: Augmentation = {} #distionary of translated and backtranslated augmentations of this review in object format, e.g.,
@@ -85,7 +88,11 @@ def to_dict(self, w_augs=False):
def get_aos(self) -> List[List[AspectOpinionSentiment]]:
r = []
if not self.aos: return r
- for i, aos in enumerate(self.aos): r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
+ for i, aos in enumerate(self.aos):
+ if self.implicit[i]:
+ r.append([([None], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
+ else:
+ r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
return r
def get_txt(self): return '. '.join(' '.join(s) for s in self.sentences)
diff --git a/src/cmn/semeval.py b/src/cmn/semeval.py
index 3222684..da3debd 100644
--- a/src/cmn/semeval.py
+++ b/src/cmn/semeval.py
@@ -11,8 +11,8 @@ class SemEvalReview(Review):
def __init__(self, id, sentences, time, author, aos): super().__init__(self, id, sentences, time, author, aos)
@staticmethod
- def load(path):
- if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path)
+ def load(path, explicit=True, implicit=False):
+ if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path, explicit, implicit)
return SemEvalReview._txtloader(input)
@staticmethod
@@ -32,11 +32,11 @@ def _txtloader(path):
return reviews
@staticmethod
- def _xmlloader(path):
+ def _xmlloader(path, explicit, implicit):
reviews_list = []
xtree = et.parse(path).getroot()
- if xtree.tag == 'Reviews': reviews = [SemEvalReview._parse(xsentence) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
- if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence) for xsentence in tqdm(xtree)]
+ if xtree.tag == 'Reviews': reviews = [SemEvalReview._parse(xsentence, explicit, implicit) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
+ if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence, explicit, implicit) for xsentence in tqdm(xtree)]
return [r for r in reviews if r]
@@ -56,7 +56,7 @@ def _map_idx(aspect, text):
return [i for i in range(len(text_tokens), len(text_tokens) + len(aspect_tokens))]
@staticmethod
- def _parse(xsentence):
+ def _parse(xsentence, explicit, implicit):
id = xsentence.attrib["id"]
aos = []; aos_cats = []
for element in xsentence:
@@ -64,7 +64,9 @@ def _parse(xsentence):
elif element.tag == 'Opinions':#semeval-15-16
#
for opinion in element:
- if opinion.attrib["target"] == 'NULL': continue
+ # Load implicit, explicit, or both aspects
+ if not implicit and opinion.attrib["target"] == 'NULL': continue
+ if not explicit and opinion.attrib["target"] != 'NULL': continue
# we may have duplicates for the same aspect due to being in different category like in semeval 2016's
aspect = (opinion.attrib["target"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
# we need to map char index to token index in aspect
@@ -78,7 +80,9 @@ def _parse(xsentence):
elif element.tag == 'aspectTerms':#semeval-14
#
for opinion in element:
- if opinion.attrib["term"] == 'NULL': continue
+ # Load implicit, explicit, or both aspects
+ if not implicit and opinion.attrib["term"] == 'NULL': continue
+ if not explicit and opinion.attrib["term"] != 'NULL': continue
# we may have duplicates for the same aspect due to being in different category like in semeval 2016's
aspect = (opinion.attrib["term"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
# we need to map char index to token index in aspect
@@ -93,15 +97,24 @@ def _parse(xsentence):
#
aos_cats.append(opinion.attrib["category"])
+ # Mark all aos with implicit aspects
+ implicit_arr = [False] * len(aos)
+ if implicit:
+ for i, (idxlist, o, s, aspect_token) in enumerate(aos):
+ if aspect_token == 'NULL': implicit_arr[i] = True
+
#sentence = nlp(sentence) # as it does some processing, it destroys the token idx for aspect term
tokens = sentence.split()
# to fix ",a b c," to "a b c"
# to fix '"sales" team' to 'sales team' => semeval-14-labptop-
# todo: fix 'Food-awesome.' to 'food awesome' => semeval-14-restaurant-
for i, (idxlist, o, s, aspect_token) in enumerate(aos):
- for j, idx in enumerate(idxlist): tokens[idx] = aspect_token.split()[j].replace('"', '')
- aos[i] = (idxlist, o, s)
+ for j, idx in enumerate(idxlist):
+ if not implicit_arr[i]:
+ tokens[idx] = aspect_token.split()[j].replace('"', '')
+ aos[i] = (idxlist, o, s)
+
return Review(id=id, sentences=[[str(t).lower() for t in tokens]], time=None, author=None,
aos=[aos], lempos=None,
- parent=None, lang='eng_Latn', category=aos_cats) if aos else None
+ parent=None, lang='eng_Latn', category=aos_cats, implicit=implicit_arr) if aos else None
diff --git a/tests/ev_implicit_reviews.py b/tests/ev_implicit_reviews.py
new file mode 100644
index 0000000..54fbe86
--- /dev/null
+++ b/tests/ev_implicit_reviews.py
@@ -0,0 +1,86 @@
+"""Expected values for test cases of implicit/explicit review loading"""
+
+# Expected first review, last review, and number of reviews with implicit aspects
+SEMEVAL_IMPLICIT = {
+ "first": {
+ 'id': '1004293:2',
+ 'text': 'they never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.',
+ 'sentences': [
+ ['they', 'never', 'brought', 'us', 'complimentary', 'noodles,', 'ignored', 'repeated', 'requests', 'for', 'sugar,', 'and', 'threw', 'our', 'dishes', 'on', 'the', 'table.']
+ ],
+ 'aos': [[([None], [], '-1')]],
+ 'lang': 'eng_Latn',
+ 'orig': True
+ },
+ "last": {
+ 'id': '1058221:7',
+ 'text': 'the last time i walked by it looked pretty empty. hmmm.',
+ 'sentences': [
+ ['the', 'last', 'time', 'i', 'walked', 'by', 'it', 'looked', 'pretty', 'empty.', 'hmmm.']
+ ],
+ 'aos': [[([None], [], '-1')]],
+ 'lang': 'eng_Latn',
+ 'orig': True
+ },
+ "count": 10,
+}
+
+# Expected first review, last review, and number of reviews with explicit aspects
+SEMEVAL_EXPLICIT = {
+ "first": {
+ 'id': '1004293:0',
+ 'text': 'judging from previous posts of test this used to be a good place but not any longer.',
+ 'sentences': [
+ ['judging', 'from', 'previous', 'posts', 'of', 'test', 'this', 'used', 'to', 'be', 'a', 'good', 'place', 'but', 'not', 'any', 'longer.']
+ ],
+ 'aos': [
+ [(['posts', 'of', 'test'], [], '-1'),
+ (['place'], [], '-1')]
+ ],
+ 'lang': 'eng_Latn',
+ 'orig': True
+ },
+ "last": {
+ 'id': '1058221:4',
+ 'text': 'i happen to have a policy that goes along with a little bit of self-respect, which includes not letting a waiter intimidate me, i.e. make me feel bad asking for trivialities like water, or the check.',
+ 'sentences': [
+ ['i', 'happen', 'to', 'have', 'a', 'policy', 'that', 'goes', 'along', 'with', 'a', 'little', 'bit', 'of', 'self-respect,', 'which', 'includes', 'not', 'letting', 'a', 'waiter', 'intimidate', 'me,', 'i.e.', 'make', 'me', 'feel', 'bad', 'asking', 'for', 'trivialities', 'like', 'water,', 'or', 'the', 'check.']
+ ],
+ 'aos': [[(['waiter'], [], '-1')]],
+ 'lang': 'eng_Latn',
+ 'orig': True
+ },
+ "count": 18,
+}
+
+# Expected first review, last review, and number of reviews with implicit and explicit aspects
+SEMEVAL_BOTH = {
+ "first": {
+ 'id': '1004293:0',
+ 'text': 'judging from previous posts of test this used to be a good place but not any longer.',
+ 'sentences': [
+ ['judging', 'from', 'previous', 'posts', 'of', 'test', 'this', 'used', 'to', 'be', 'a', 'good', 'place', 'but', 'not', 'any', 'longer.']
+ ],
+ 'aos': [
+ [(['posts', 'of', 'test'], [], '-1'),
+ (['place'], [], '-1')]
+ ],
+ 'lang': 'eng_Latn',
+ 'orig': True
+ },
+ "last": {
+ 'id': '1058221:7',
+ 'text': 'the last time i walked by it looked pretty empty. hmmm.',
+ 'sentences': [
+ ['the', 'last', 'time', 'i', 'walked', 'by', 'it', 'looked', 'pretty', 'empty.', 'hmmm.']
+ ],
+ 'aos': [[([None], [], '-1')]],
+ 'lang': 'eng_Latn',
+ 'orig': True
+ },
+ "count": 26,
+}
+
+SEMEVAL_NULL = {
+ "count": 0,
+}
diff --git a/tests/test_implicit_reviews.py b/tests/test_implicit_reviews.py
new file mode 100644
index 0000000..9f7ec5a
--- /dev/null
+++ b/tests/test_implicit_reviews.py
@@ -0,0 +1,60 @@
+"""Unit tests for implicit review loading"""
+import sys
+import os
+import pytest
+from ev_implicit_reviews import SEMEVAL_EXPLICIT, SEMEVAL_IMPLICIT, SEMEVAL_BOTH, SEMEVAL_NULL
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
+
+from cmn.review import Review
+from cmn.semeval import SemEvalReview
+from cmn.twitter import TwitterReview # Currently, no twitter reviews have implicit aspects
+
+SEMEVAL_PATH = "./data/raw/semeval/toy.2016SB5/ABSA16_Restaurants_Train_SB1_v2.xml"
+
+@pytest.mark.parametrize("path, expected", [
+ (SEMEVAL_PATH, SEMEVAL_IMPLICIT),
+])
+def test_implicit(path, expected):
+ """Test loading implicit aspect containing reviews."""
+ reviews: list[Review] = SemEvalReview.load(path, explicit=False, implicit=True)
+ first = reviews[0].to_dict()[0]
+ last = reviews[-1].to_dict()[0]
+ count = len(reviews)
+ assert first == expected["first"]
+ assert last == expected["last"]
+ assert count == expected["count"]
+
+@pytest.mark.parametrize("path, expected", [
+ (SEMEVAL_PATH, SEMEVAL_EXPLICIT),
+])
+def test_explicit(path, expected):
+ """Test loading explicit aspect containing reviews."""
+ reviews: list[Review] = SemEvalReview.load(path, explicit=True, implicit=False)
+ first = reviews[0].to_dict()[0]
+ last = reviews[-1].to_dict()[0]
+ count = len(reviews)
+ assert first == expected["first"]
+ assert last == expected["last"]
+ assert count == expected["count"]
+
+@pytest.mark.parametrize("path, expected", [
+ (SEMEVAL_PATH, SEMEVAL_BOTH),
+])
+def test_implicit_and_explicit(path, expected):
+ """Test loading both implicit and explicit reviews."""
+ reviews: list[Review] = SemEvalReview.load(path, explicit=True, implicit=True)
+ first = reviews[0].to_dict()[0]
+ last = reviews[-1].to_dict()[0]
+ count = len(reviews)
+ assert first == expected["first"]
+ assert last == expected["last"]
+ assert count == expected["count"]
+
+@pytest.mark.parametrize("path, expected", [
+ (SEMEVAL_PATH, SEMEVAL_NULL),
+])
+def test_null(path, expected):
+ """Test loading neither implicit nor explicit reviews."""
+ reviews: list[Review] = SemEvalReview.load(path, explicit=False, implicit=False)
+ count = len(reviews)
+ assert count == expected["count"]