diff --git a/src/aml/bert.py b/src/aml/bert.py
index ba38e37..942f0fb 100644
--- a/src/aml/bert.py
+++ b/src/aml/bert.py
@@ -221,4 +221,7 @@ def infer_batch(self, reviews_test, h_ratio, doctype, output):
     def infer_batch_sentiment(self, reviews_test: List[Review], h_ratio: int, doctype: str, output: str):
         _, sentiment_pairs = self.get_pairs_and_test(reviews_test, h_ratio, doctype, output)
 
-        return sentiment_pairs
\ No newline at end of file
+        return sentiment_pairs
+
+    def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
+        self.train(reviews_train, reviews_valid, settings, doctype, no_extremes, output)
\ No newline at end of file
diff --git a/src/aml/mdl.py b/src/aml/mdl.py
index 402756f..09946ae 100644
--- a/src/aml/mdl.py
+++ b/src/aml/mdl.py
@@ -37,11 +37,6 @@ def __init__(self, naspects: int, nwords: int, capabilities: ModelCapabilities=[
     def name(self) -> str: return self.__class__.__name__.lower()
     def load(self, path): pass
 
-    def train(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
-        corpus, self.dict = _AbstractReviewAnalysisModel.preprocess(doctype, reviews_train, no_extremes)
-        self.dict.save(f'{output}model.dict')
-        pd.to_pickle(self.cas, f'{output}model.perf.cas')
-        pd.to_pickle(self.perplexity, f'{output}model.perf.perplexity')
 
     def quality(self, metric: Metrics):
         result = QualityType(coherence=f'{np.mean(self.cas)}\u00B1{np.std(self.cas)}', perplexity=self.perplexity)
@@ -49,8 +44,6 @@ def quality(self, metric: Metrics):
         # elif metric is "perplexity":
         #     return
 
-    def infer(self, review: Review, doctype: str) -> List[List[AspectPairType]]: pass # type: ignore
-
     @staticmethod
     def preprocess(doctype, reviews, settings=None):
         if not _AbstractReviewAnalysisModel.stop_words:
@@ -86,6 +79,7 @@ def __init__(self, naspects: int, nwords: int, capabilities: ModelCapabilities=[
         self.naspects = naspects
         self.nwords = nwords
 
+    def infer(self, review: Review, doctype: str) -> List[List[AspectPairType]]: pass # type: ignore
     def infer_batch(self, reviews_test: List[Review], h_ratio: int, doctype: str, output: str) -> BatchPairsType:
         pairs: BatchPairsType = []
 
@@ -103,6 +97,12 @@ def infer_batch(self, reviews_test: List[Review], h_ratio: int, doctype: str, ou
 
         return pairs
 
+    def train(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
+        corpus, self.dict = _AbstractReviewAnalysisModel.preprocess(doctype, reviews_train, no_extremes)
+        self.dict.save(f'{output}model.dict')
+        pd.to_pickle(self.cas, f'{output}model.perf.cas')
+        pd.to_pickle(self.perplexity, f'{output}model.perf.perplexity')
+
     def get_aspects_words(self, nwords): pass
     def get_aspect_words(self, aspect_id: Aspect, nwords: int) -> List[AspectPairType]: pass # type: ignore
     def merge_aspects_words(self, r_pred_aspects: List[List[AspectPairType]], nwords: int) -> List[List[AspectPairType]]:
@@ -125,6 +125,13 @@ def __init__(self, naspects: int, nwords: int, capabilities: ModelCapabilities=[
         self.naspects = naspects
         self.nwords = nwords
 
+    def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output) -> None:
+        corpus, self.dict = _AbstractReviewAnalysisModel.preprocess(doctype, reviews_train, no_extremes)
+        self.dict.save(f'{output}model.dict')
+        pd.to_pickle(self.cas, f'{output}model.perf.cas')
+        pd.to_pickle(self.perplexity, f'{output}model.perf.perplexity')
+
+    def infer_sentiment(self, review: Review, doctype: str) -> List[List[AspectPairType]]: pass # type: ignore
     def infer_batch_sentiment(self, reviews_test: List[Review], h_ratio: int, doctype: str, output: str) -> BatchPairsType:
         pairs: BatchPairsType = []
 
@@ -135,7 +142,7 @@ def infer_batch_sentiment(self, reviews_test: List[Review], h_ratio: int, doctyp
 
             if random.random() < h_ratio: r_ = r.hide_aspects()
            else: r_ = r
-            r_pred_aspects = self.infer(r_, doctype)
+            r_pred_aspects = self.infer_sentiment(r_, doctype)
             # removing duplicate aspect words ==> handled in metrics()
             pairs.extend(list(zip(r_aspect_ids, r_pred_aspects)))
 
diff --git a/src/main.py b/src/main.py
index f458c70..b53517b 100644
--- a/src/main.py
+++ b/src/main.py
@@ -105,7 +105,7 @@ def default(self, obj):
     with open(f'{output}/splits.json', 'w') as f: json.dump(splits, f, cls=NumpyArrayEncoder, indent=1)
     return splits
 
-def train(args, am, train, valid, f, output):
+def train(args, am, train, valid, f, output, capability: ModelCapability):
     print(f'\n2. Aspect model training for {am.name()} ...')
     print('#' * 50)
     try:
@@ -114,7 +114,7 @@ def train(args, am, train, valid, f, output):
     except (FileNotFoundError, EOFError) as _:
         print(f'2.1. Loading saved aspect model failed! Training {am.name()} for {args.naspects} of aspects. See {output}/f{f}.model.train.log for training logs ...')
         if not os.path.isdir(output): os.makedirs(output)
-        am.train(train, valid, params.settings['train'][am.name()], params.settings['prep']['doctype'], params.settings['train']['no_extremes'], f'{output}/f{f}.')
+        get_model_train_method(am, capability)(train, valid, params.settings['train'][am.name()], params.settings['prep']['doctype'], params.settings['train']['no_extremes'], f'{output}/f{f}.')
         # from aml.mdl import AbstractAspectModel
 
     print('2.2. Quality of aspects ...')
@@ -145,6 +145,14 @@ def get_model_infer_method(am: Union[AbstractSentimentModel, AbstractAspectModel
 
     raise Exception(f'Not handled model: {am.name()}')
 
+def get_model_train_method(am: Union[AbstractSentimentModel, AbstractAspectModel], model_capability: ModelCapability):
+    if isinstance(am, AbstractAspectModel) and model_capability == 'aspect_detection':
+        return am.train
+    elif isinstance(am, AbstractSentimentModel) and model_capability == 'sentiment_analysis':
+        return am.train_sentiment
+
+    raise Exception(f'Not handled model: {am.name()}')
+
 def get_model_metrics(model_capability: ModelCapability) -> Set[str]:
     return pampy.match(model_capability,
@@ -248,14 +256,16 @@ def main(args):
         output = f'{output}/{am.name()}/'
 
+        train_for: ModelCapabilities = set(params.settings['train']['for']).intersection(am.capabilities) #type: ignore
         if 'train' in params.settings['cmd']:
-            for f in splits['folds'].keys():
-                t_s = time.time()
-                reviews_train = np.array(reviews)[splits['folds'][f]['train']].tolist()
-                reviews_train.extend([r_.augs[lang][1] for r_ in reviews_train for lang in params.settings['prep']['langaug'] if lang and r_.augs[lang][2] >= params.settings['train']['langaug_semsim']])
-                train(args, am, reviews_train, np.array(reviews)[splits['folds'][f]['valid']].tolist(), f, output)
-                print(f'Trained time elapsed including language augs {params.settings["prep"]["langaug"]}: {time.time() - t_s}')
+            for capability in train_for:
+                for f in splits['folds'].keys():
+                    t_s = time.time()
+                    reviews_train = np.array(reviews)[splits['folds'][f]['train']].tolist()
+                    reviews_train.extend([r_.augs[lang][1] for r_ in reviews_train for lang in params.settings['prep']['langaug'] if lang and r_.augs[lang][2] >= params.settings['train']['langaug_semsim']])
+                    train(args, am, reviews_train, np.array(reviews)[splits['folds'][f]['valid']].tolist(), f, output, capability)
+                    print(f'Trained time elapsed including language augs {params.settings["prep"]["langaug"]}: {time.time() - t_s}')
 
         eval_for: ModelCapabilities = set(params.settings['eval']['for']).intersection(am.capabilities) #type: ignore
 
         # testing
         if 'test' in params.settings['cmd']:
diff --git a/src/params.py b/src/params.py
index 59c4a1b..4a7f3c1 100644
--- a/src/params.py
+++ b/src/params.py
@@ -22,6 +22,7 @@ def to_range(range_str): return range(int(range_str.split(':')[0]), int(range_st
         'batch': True,
     },
     'train': {
+        'for': ['aspect_detection', 'sentiment_analysis'],
         'ratio': 0.85, # 1 - ratio goes to test. To train on entire dataset: 0.999 and 'nfolds': 0
         'nfolds': 5, # on the train, nfold x-valid, 0: no x-valid only test and train, 1: test, 1-fold
         'langaug_semsim': 0.5, # backtranslated review is in training if its semantic similarity with original review is >= this value
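Note (illustration, not part of the patch): the new get_model_train_method mirrors the existing get_model_infer_method by dispatching on both the model's base class and the requested capability. Below is a minimal, runnable sketch of that dispatch pattern; the classes are simplified stand-ins, not the real AbstractAspectModel / AbstractSentimentModel hierarchy in src/aml/mdl.py.

# Illustrative sketch only; stand-in classes instead of the real model hierarchy.
from typing import Callable, Literal, Union

ModelCapability = Literal['aspect_detection', 'sentiment_analysis']

class AspectModel:  # stand-in for AbstractAspectModel
    def train(self, *args) -> None:
        print('aspect_detection training on', args)

class SentimentModel:  # stand-in for AbstractSentimentModel
    def train_sentiment(self, *args) -> None:
        print('sentiment_analysis training on', args)

def get_model_train_method(am: Union[AspectModel, SentimentModel],
                           capability: ModelCapability) -> Callable[..., None]:
    # Return the bound training method that matches both the model type and the
    # requested capability; any other combination is rejected.
    if isinstance(am, AspectModel) and capability == 'aspect_detection':
        return am.train
    if isinstance(am, SentimentModel) and capability == 'sentiment_analysis':
        return am.train_sentiment
    raise Exception(f'Not handled model: {type(am).__name__}')

if __name__ == '__main__':
    # Mirrors the per-capability training loop in main.py: pick the method, then call it.
    get_model_train_method(AspectModel(), 'aspect_detection')('reviews_train', 'reviews_valid')
    get_model_train_method(SentimentModel(), 'sentiment_analysis')('reviews_train', 'reviews_valid')

Because the returned bound method is called with the same argument list for either capability, train() in main.py stays capability-agnostic and only forwards the capability it was given.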