Add Imperceptibility Improvement Methods #20

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion baselines/run_GAReader.py
@@ -249,7 +249,6 @@ def main(config, model_filename):

if __name__ == "__main__":


model_name = "GAReader"
data_dir = "./data/imperceptibility/training_data"
embedding_folder = "./baselines/embeddings/" ##
@@ -262,6 +261,7 @@ def main(config, model_filename):

if model_name == "GAReader":
from baselines.GAReader import args, GAReader

main(
args.get_args(data_dir, cache_dir, embedding_folder, output_dir, log_dir),
model_filename,
5 changes: 4 additions & 1 deletion baselines/utils/arc_embedding_utils.py
@@ -42,7 +42,10 @@ def load_data(
)

print(
"the size of train: {}, dev:{},".format(len(train.examples), len(dev.examples),)
"the size of train: {}, dev:{},".format(
len(train.examples),
len(dev.examples),
)
)

word_field.build_vocab(
4 changes: 3 additions & 1 deletion eval.py
@@ -18,7 +18,9 @@


validation_dataset = ConcretenessDataset(
file_path=val_file_path, tokenizer=tokenizer, split="val",
file_path=val_file_path,
tokenizer=tokenizer,
split="val",
)
val_loader = DataLoader(validation_dataset, batch_size=1, shuffle=False)
embeddings = GloveEmbedding(
2 changes: 1 addition & 1 deletion src/datasets/__init__.py
@@ -1,2 +1,2 @@
from src.datasets.cloze_dataset import *
from src.datasets.max_cloze_dataset import *
from src.datasets.max_cloze_dataset import *
183 changes: 183 additions & 0 deletions src/improvement_methods/improve.py
@@ -0,0 +1,183 @@
import argparse
import copy
import heapq
import json
import os

import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from src.datasets.cloze_dataset import ClozeDataset
from src.improvement_methods import StatisticalEmbedding
from src.models import *
from src.utils.configuration import Config

# Resolve the default config paths relative to this file.
dirname = os.path.dirname(os.path.abspath(__file__))

parser = argparse.ArgumentParser(
prog="improve.py",
description="Apply improvement approach to imperceptibility methods",
)

parser.add_argument(
"--model",
type=str,
action="store",
help="The configuration for model",
default=os.path.join(dirname, "./configs/models/forty/default.yaml"),
)
parser.add_argument(
"--data",
type=str,
action="store",
help="The configuration for data",
default=os.path.join(dirname, "./configs/datasets/forty/default.yaml"),
)
parser.add_argument(
"--trained_model_path",
type=str,
help="Path of the trained model's path",
default="/content/drive/MyDrive/SemEval/SemEval_final/distilbert_train_trial/ReCAM-final/ckpts_old/all_ckpts/3_5600.pth",
)
parser.add_argument(
"--test_configuration",
help="Whether test data is being used.",
type=str,
default=False,
)
parser.add_argument(
"--improvement_method",
help="Select between: Thresholding Method(threshold), Difference Method(difference), Second Highest Probability Method(second_highest)",
type=str,
default="threshold",
)
args = parser.parse_args()

dataset_config = Config(path=args.data)
model_config = Config(path=args.model)
path = args.trained_model_path
test_flag = args.test_configuration
emb = StatisticalEmbedding(normalise=False)


def generate_cloze_predictions(dataset_path, generate_hyponyms=False):
    with open(dataset_path) as f:
        datapoints = [json.loads(datapoint) for datapoint in f.read().splitlines()]
    model_name = model_config.params["pretrained_model_name_or_path"]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    cloze_dataset = ClozeDataset(dataset_config, tokenizer)
    # The checkpoint stores the model under the "model_state_dict" key.
    model = torch.load(path)["model_state_dict"]
    dataloader = DataLoader(
        cloze_dataset,
        collate_fn=cloze_dataset.custom_collate_fn,
        batch_size=1,
        shuffle=False,
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    final_datapoints = []
    with torch.no_grad():
        model.eval()
        for i, batch in enumerate(dataloader):
            *inputs, label = [torch.tensor(value, device=device) for value in batch]
            datapoint = datapoints[i]
            if generate_hyponyms:
cloze_prediction, bert_output = model(inputs)
cloze_prediction_label = torch.argmax(cloze_prediction)
final_datapoints.append(
{
"cloze_prediction": [
float(i) for i in cloze_prediction.cpu().numpy()[0]
],
"cloze_prediction_label": int(
cloze_prediction_label.cpu().numpy()
),
"datapoint": datapoint,
"bert_output": bert_output.detach().cpu().tolist(),
}
)
else:
cloze_prediction = model(inputs)
cloze_prediction_label = torch.argmax(cloze_prediction)
final_datapoints.append(
{
"cloze_prediction": [
float(i) for i in cloze_prediction.cpu().numpy()[0]
],
"cloze_prediction_label": int(
cloze_prediction_label.cpu().numpy()
),
"datapoint": datapoint,
}
)
if test_flag:
final_datapoints[-1]["id"] = datapoint["id"]
return final_datapoints


def improvement_methods(p_value, final_datapoints, method="threshold"):
    softmax_function = torch.nn.Softmax(dim=-1)

    for i, data in enumerate(final_datapoints):
        cloze_preds = torch.Tensor(data["cloze_prediction"])
        cloze_probs = softmax_function(cloze_preds)
        # Indices of the highest-scoring options, largest first.
        lst_inds = heapq.nlargest(
            3, range(len(cloze_preds)), key=cloze_preds.__getitem__
        )
        first_max = lst_inds[0]
        second_max = lst_inds[1]

        if method == "threshold":
            # Reconsider when the model is not confident about its top choice.
            trigger = cloze_probs[first_max] < p_value
        elif method == "difference":
            # Reconsider when the top two options are close in probability.
            trigger = cloze_probs[first_max] - cloze_probs[second_max] < p_value
        elif method == "second_highest":
            # Reconsider when the runner-up still has a high probability.
            trigger = cloze_probs[second_max] > p_value
        else:
            trigger = False

        if trigger:
            max_option = data["datapoint"]["option_" + str(first_max)]
            second_max_option = data["datapoint"]["option_" + str(second_max)]
            max_opt_emb = np.array(emb.get_embedding(max_option))
            sec_opt_emb = np.array(emb.get_embedding(second_max_option))
            # Compare the two options dimension-wise and switch to the runner-up
            # if it is larger on at least half of the differing dimensions.
            no_of_unequals = sec_opt_emb.shape[0] - (sec_opt_emb == max_opt_emb).sum()
            if (sec_opt_emb > max_opt_emb).sum() >= no_of_unequals / 2:
                final_datapoints[i]["cloze_prediction_label"] = second_max
    return final_datapoints


def write_to_csv(final_datapoints):
    output = ""
    for i, data in enumerate(final_datapoints):
        # Test data carries its own id; otherwise fall back to the row index.
        row_id = data["id"] if test_flag else i
        output += "{},{}\n".format(row_id, int(data["cloze_prediction_label"]))
    with open("output.csv", "w") as f:
        f.write(output)
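

The module above defines the prediction, improvement, and CSV-export steps but never calls them, so a small driver is needed to run the pipeline end to end. The following is a minimal sketch, not part of this diff; the `test_data_path` config key and the 0.5 threshold are assumptions and would need to match the actual dataset configuration.

# Hypothetical driver, not part of this PR: chain the three steps together.
if __name__ == "__main__":
    # "test_data_path" is an assumed key pointing at a JSONL file of datapoints;
    # replace it with whatever the dataset config actually exposes.
    data_file = dataset_config.params["test_data_path"]
    predictions = generate_cloze_predictions(data_file, generate_hyponyms=False)
    improved = improvement_methods(0.5, predictions, method=args.improvement_method)
    write_to_csv(improved)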
161 changes: 161 additions & 0 deletions src/improvement_methods/statistical_embeddings.py
@@ -0,0 +1,161 @@
import glob
import nltk
from tqdm.auto import tqdm
from numpy.linalg import norm
import spacy
from nltk.corpus import sentiwordnet as swn
from itertools import chain
from nltk.corpus import wordnet as wn

nltk.download("punkt")
nlp = spacy.load("en", disable=["parser", "ner"])
nltk.download("sentiwordnet")
nltk.download("wordnet")


class StatisticalEmbedding:
def __init__(self, normalise=True):
# add word frequency later
# try to fix number of senses and add it later
# try to fix number of hyponyms and add it later
self.normalise = normalise

def get_embedding(self, word):
len_embedding = self.get_length_of_word(word)
sense_embedding = self.get_number_of_senses(word)
hyponym_embedding = self.get_no_of_hyponyms(word)
avg_hyponym_embedding = self.get_avg_no_of_hyponyms(word)
depth_hypernymy_embedding = self.get_depth_of_hypernymy_tree(word)
avg_depth_hypernymy_embedding = self.get_avg_depth_of_hypernymy_tree(word)
pos_neg_obj_score = self.get_pos_neg_obj_scores(word)
avg_pos_neg_obj_score = self.get_avg_pos_neg_obj_scores(word)

embedding = [
len_embedding,
sense_embedding,
hyponym_embedding,
avg_hyponym_embedding,
depth_hypernymy_embedding,
avg_depth_hypernymy_embedding,
pos_neg_obj_score[0],
pos_neg_obj_score[1],
pos_neg_obj_score[2],
avg_pos_neg_obj_score[0],
avg_pos_neg_obj_score[1],
avg_pos_neg_obj_score[2],
]
if self.normalise:
embedding = embedding / norm(embedding)
return embedding

def get_length_of_word(self, word):
words = word.split(" ")
lengths = [len(word) for word in words]
max_len = max(lengths)
return max_len

def get_number_of_senses(self, word):
# words = word.split(' ')
# lst_of_senses = [len(wn.synsets(word)) for word in words]
# max_no_of_senses = max(lst_of_senses)
return len(wn.synsets(word))

def get_depth_of_hypernymy_tree(self, word):
max_len_paths = 0
words = word.split(" ")
for word_n in words:
if len(wn.synsets(word_n)) > 0:
j = wn.synsets(word_n)[0]
paths_to_top = j.hypernym_paths()
max_len_paths = max(
max_len_paths, len(max(paths_to_top, key=lambda i: len(i)))
)

return 100000 - max_len_paths

    def get_avg_depth_of_hypernymy_tree(self, word):
        words = word.split(" ")
        lst_avg_len_paths = []
        for word_n in words:
            i = 0
            avg_len_paths = 0

            for j in wn.synsets(word_n):
                paths_to_top = j.hypernym_paths()
                max_len_path = len(max(paths_to_top, key=lambda k: len(k)))
                avg_len_paths += max_len_path
                i += 1
            if i > 0:
                lst_avg_len_paths.append(avg_len_paths / i)
        # Aggregate over the words of a multi-word option; fall back to the
        # sentinel value when no synsets are found for any word.
        if lst_avg_len_paths:
            return 100000 - max(lst_avg_len_paths)
        return 100000

def get_pos_neg_obj_scores(self, word):
words = word.split(" ")
pos_scores = []
neg_scores = []
obj_scores = []

for word_n in words:

if len(list(swn.senti_synsets(word_n))) > 0:
j = list(swn.senti_synsets(word_n))[0]

pos_scores.append(j.pos_score())
neg_scores.append(j.neg_score())
obj_scores.append(j.obj_score())
else:
pos_scores.append(0)
neg_scores.append(0)
obj_scores.append(0)
return (max(pos_scores), max(neg_scores), 1 - max(obj_scores))

def get_avg_pos_neg_obj_scores(self, word):
words = word.split(" ")
pos_scores = []
neg_scores = []
obj_scores = []

for word_n in words:
ct = 0
avg_pos_score = 0
avg_neg_score = 0
avg_obj_score = 0

for j in list(swn.senti_synsets(word_n)):
avg_pos_score += j.pos_score()
avg_neg_score += j.neg_score()
avg_obj_score += j.obj_score()
ct += 1

if ct > 0:
pos_scores.append(avg_pos_score / ct)
neg_scores.append(avg_neg_score / ct)
obj_scores.append(avg_obj_score / ct)
else:
pos_scores.append(0)
neg_scores.append(0)
obj_scores.append(0)
return (max(pos_scores), max(neg_scores), 1 - max(obj_scores))

def get_no_of_hyponyms(self, word):

if len(wn.synsets(word)) > 0:
j = wn.synsets(word)[0]
# print(word)
# print(j.hyponyms())
no_of_hypos = len(list(chain(*[l.lemma_names() for l in j.hyponyms()])))
return no_of_hypos
else:
return 0

def get_avg_no_of_hyponyms(self, word):
i = 0
no_of_hypos = 0
for j in wn.synsets(word):
no_of_hypos += len(list(chain(*[l.lemma_names() for l in j.hyponyms()])))
i += 1
if i > 0:
return no_of_hypos / i
else:
return 0
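

For context, `get_embedding` returns a 12-dimensional list of WordNet/SentiWordNet statistics (word length, sense count, hyponym counts, hypernymy-tree depths, and sentiment scores), and the improvement methods in improve.py compare two answer options dimension-wise on it. A small illustrative sketch of that comparison, using arbitrary example words, looks like this:

# Illustrative only: dimension-wise comparison of two candidate options,
# mirroring the logic in improve.py. The example words are arbitrary.
import numpy as np

emb = StatisticalEmbedding(normalise=False)
first = np.array(emb.get_embedding("animal"))
second = np.array(emb.get_embedding("dog"))

# Count the dimensions where the embeddings differ, and prefer the second
# option if it is larger on at least half of those dimensions.
differing = first.shape[0] - (first == second).sum()
prefer_second = (second > first).sum() >= differing / 2
print(prefer_second)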