
✨ Add train sentiment
3ripleM committed Nov 28, 2023
1 parent 6276f73 commit 061a3f0
Showing 4 changed files with 38 additions and 17 deletions.
5 changes: 4 additions & 1 deletion src/aml/bert.py
@@ -221,4 +221,7 @@ def infer_batch(self, reviews_test, h_ratio, doctype, output):
    def infer_batch_sentiment(self, reviews_test: List[Review], h_ratio: int, doctype: str, output: str):
        _, sentiment_pairs = self.get_pairs_and_test(reviews_test, h_ratio, doctype, output)

        return sentiment_pairs

    def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
        self.train(reviews_train, reviews_valid, settings, doctype, no_extremes, output)
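
In bert.py the new `train_sentiment` entry point is a thin alias: it reuses the existing `train` path with the same arguments, so sentiment training on the BERT-based model needs no separate training loop. A minimal usage sketch of that delegation (the class and argument values below are hypothetical stand-ins, not the repo's actual wrapper):

```python
# Hypothetical stand-in for the wrapper in bert.py; only the delegation
# pattern is shown, real training and I/O are omitted.
class SentimentCapableModel:
    def train(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output):
        print(f'training on {len(reviews_train)} reviews -> {output}')  # placeholder for the real trainer

    def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output):
        # same signature, same behaviour: delegate to the joint trainer
        self.train(reviews_train, reviews_valid, settings, doctype, no_extremes, output)

model = SentimentCapableModel()
# doctype/output values here are placeholders, not the project's defaults
model.train_sentiment(['r1', 'r2'], ['r3'], settings={}, doctype='snt', no_extremes=None, output='out/f0.')
```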
23 changes: 15 additions & 8 deletions src/aml/mdl.py
@@ -37,20 +37,13 @@ def __init__(self, naspects: int, nwords: int, capabilities: ModelCapabilities=[

    def name(self) -> str: return self.__class__.__name__.lower()
    def load(self, path): pass
    def train(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
        corpus, self.dict = _AbstractReviewAnalysisModel.preprocess(doctype, reviews_train, no_extremes)
        self.dict.save(f'{output}model.dict')
        pd.to_pickle(self.cas, f'{output}model.perf.cas')
        pd.to_pickle(self.perplexity, f'{output}model.perf.perplexity')

    def quality(self, metric: Metrics):
        result = QualityType(coherence=f'{np.mean(self.cas)}\u00B1{np.std(self.cas)}', perplexity=self.perplexity)
        return result[metric]
        # elif metric is "perplexity":
        #     return

    def infer(self, review: Review, doctype: str) -> List[List[AspectPairType]]: pass # type: ignore

    @staticmethod
    def preprocess(doctype, reviews, settings=None):
        if not _AbstractReviewAnalysisModel.stop_words:
@@ -86,6 +79,7 @@ def __init__(self, naspects: int, nwords: int, capabilities: ModelCapabilities=[
        self.naspects = naspects
        self.nwords = nwords

    def infer(self, review: Review, doctype: str) -> List[List[AspectPairType]]: pass # type: ignore
    def infer_batch(self, reviews_test: List[Review], h_ratio: int, doctype: str, output: str) -> BatchPairsType:
        pairs: BatchPairsType = []

@@ -103,6 +97,12 @@ def infer_batch(self, reviews_test: List[Review], h_ratio: int, doctype: str, ou

        return pairs

    def train(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
        corpus, self.dict = _AbstractReviewAnalysisModel.preprocess(doctype, reviews_train, no_extremes)
        self.dict.save(f'{output}model.dict')
        pd.to_pickle(self.cas, f'{output}model.perf.cas')
        pd.to_pickle(self.perplexity, f'{output}model.perf.perplexity')

    def get_aspects_words(self, nwords): pass
    def get_aspect_words(self, aspect_id: Aspect, nwords: int) -> List[AspectPairType]: pass # type: ignore
    def merge_aspects_words(self, r_pred_aspects: List[List[AspectPairType]], nwords: int) -> List[List[AspectPairType]]:
@@ -125,6 +125,13 @@ def __init__(self, naspects: int, nwords: int, capabilities: ModelCapabilities=[
        self.naspects = naspects
        self.nwords = nwords

    def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
        corpus, self.dict = _AbstractReviewAnalysisModel.preprocess(doctype, reviews_train, no_extremes)
        self.dict.save(f'{output}model.dict')
        pd.to_pickle(self.cas, f'{output}model.perf.cas')
        pd.to_pickle(self.perplexity, f'{output}model.perf.perplexity')

    def infer_sentiment(self, review: Review, doctype: str) -> List[List[AspectPairType]]: pass # type: ignore
    def infer_batch_sentiment(self, reviews_test: List[Review], h_ratio: int, doctype: str, output: str) -> BatchPairsType:
        pairs: BatchPairsType = []

@@ -135,7 +142,7 @@ def infer_batch_sentiment(self, reviews_test: List[Review], h_ratio: int, doctyp
            if random.random() < h_ratio: r_ = r.hide_aspects()
            else: r_ = r

            r_pred_aspects = self.infer_sentiment(r_, doctype)
            # removing duplicate aspect words ==> handled in metrics()

            pairs.extend(list(zip(r_aspect_ids, r_pred_aspects)))
26 changes: 18 additions & 8 deletions src/main.py
@@ -105,7 +105,7 @@ def default(self, obj):
    with open(f'{output}/splits.json', 'w') as f: json.dump(splits, f, cls=NumpyArrayEncoder, indent=1)
    return splits

def train(args, am, train, valid, f, output, capability: ModelCapability):
    print(f'\n2. Aspect model training for {am.name()} ...')
    print('#' * 50)
    try:
@@ -114,7 +114,7 @@ def train(args, am, train, valid, f, output):
    except (FileNotFoundError, EOFError) as _:
        print(f'2.1. Loading saved aspect model failed! Training {am.name()} for {args.naspects} of aspects. See {output}/f{f}.model.train.log for training logs ...')
        if not os.path.isdir(output): os.makedirs(output)
        get_model_train_method(am, capability)(train, valid, params.settings['train'][am.name()], params.settings['prep']['doctype'], params.settings['train']['no_extremes'], f'{output}/f{f}.')

    # from aml.mdl import AbstractAspectModel
    print('2.2. Quality of aspects ...')
@@ -145,6 +145,14 @@ def get_model_infer_method(am: Union[AbstractSentimentModel, AbstractAspectModel

    raise Exception(f'Not handled model: {am.name()}')

def get_model_train_method(am: Union[AbstractSentimentModel, AbstractAspectModel], model_capability: ModelCapability):
    if isinstance(am, AbstractAspectModel) and model_capability == 'aspect_detection':
        return am.train
    elif isinstance(am, AbstractSentimentModel) and model_capability == 'sentiment_analysis':
        return am.train_sentiment

    raise Exception(f'Not handled model: {am.name()}')


def get_model_metrics(model_capability: ModelCapability) -> Set[str]:
    return pampy.match(model_capability,
@@ -248,14 +256,16 @@ def main(args):
    output = f'{output}/{am.name()}/'

    eval_for: ModelCapabilities = set(params.settings['eval']['for']).intersection(am.capabilities) #type: ignore
    train_for: ModelCapabilities = set(params.settings['train']['for']).intersection(am.capabilities) #type: ignore

    if 'train' in params.settings['cmd']:
        for capability in train_for:
            for f in splits['folds'].keys():
                t_s = time.time()
                reviews_train = np.array(reviews)[splits['folds'][f]['train']].tolist()
                reviews_train.extend([r_.augs[lang][1] for r_ in reviews_train for lang in params.settings['prep']['langaug'] if lang and r_.augs[lang][2] >= params.settings['train']['langaug_semsim']])
                train(args, am, reviews_train, np.array(reviews)[splits['folds'][f]['valid']].tolist(), f, output, capability)
                print(f'Trained time elapsed including language augs {params.settings["prep"]["langaug"]}: {time.time() - t_s}')

    # testing
    if 'test' in params.settings['cmd']:
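
The training flow in main.py now mirrors the existing inference dispatch: `get_model_train_method` checks which abstract base the model derives from and which capability is requested, then returns the bound method (`train` or `train_sentiment`) that the per-fold loop invokes. A self-contained sketch of that dispatch using simplified stand-in classes (the real call also passes settings, doctype, no_extremes and the fold-specific output prefix):

```python
from typing import Callable, List, Union

# Hypothetical stand-ins for the abstract bases in aml/mdl.py.
class AspectBase:
    def train(self, reviews: List[str]) -> None: print('aspect training on', len(reviews), 'reviews')

class SentimentBase:
    def train_sentiment(self, reviews: List[str]) -> None: print('sentiment training on', len(reviews), 'reviews')

class DemoModel(AspectBase, SentimentBase):
    """A model exposing both capabilities, like the BERT wrapper in bert.py."""

def get_train_method(am: Union[AspectBase, SentimentBase], capability: str) -> Callable[[List[str]], None]:
    # same shape as get_model_train_method: pick the entry point by capability
    if isinstance(am, AspectBase) and capability == 'aspect_detection': return am.train
    if isinstance(am, SentimentBase) and capability == 'sentiment_analysis': return am.train_sentiment
    raise Exception(f'Not handled model: {type(am).__name__}')

model = DemoModel()
for capability in ['aspect_detection', 'sentiment_analysis']:
    get_train_method(model, capability)(['great battery', 'poor screen'])
```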
1 change: 1 addition & 0 deletions src/params.py
@@ -22,6 +22,7 @@ def to_range(range_str): return range(int(range_str.split(':')[0]), int(range_st
        'batch': True,
    },
    'train': {
        'for': ['aspect_detection', 'sentiment_analysis'],
        'ratio': 0.85, # 1 - ratio goes to test. To train on entire dataset: 0.999 and 'nfolds': 0
        'nfolds': 5, # on the train, nfold x-valid, 0: no x-valid only test and train, 1: test, 1-fold
        'langaug_semsim': 0.5, # backtranslated review is in training if its semantic similarity with original review is >= this value
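
The new `'for'` key in params.py declares which capabilities a run should train; main.py intersects it with each model's declared capabilities, so a model is only trained for the capabilities it actually supports. A small illustration of that filtering (hypothetical capability values; the real sets come from `params.settings['train']['for']` and each model class):

```python
# Hypothetical example of the capability filtering done in main.py.
requested = {'aspect_detection', 'sentiment_analysis'}  # params.settings['train']['for']
model_capabilities = {'aspect_detection'}                # e.g. an aspect-only topic model
train_for = requested.intersection(model_capabilities)

for capability in train_for:
    print(f'would train capability: {capability}')       # -> aspect_detection only
```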
