Commit: Evaluation
import json
import os
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
import re


class Evaluator:
    def __init__(self, test_data_folder='../data'):
        self.test_data_folder = test_data_folder
        self.submission_folder = '../data/participant_submissions'

    def load_data(self, filepath):
        # Read a JSON Lines file: one JSON record per line.
        with open(filepath, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f]

    def save_report(self, report, task_number, prediction_file_path, evaluationtype=""):
        base_filename = os.path.basename(prediction_file_path)
        base_filename = os.path.splitext(base_filename)[0]
        report_filename = f"../results/task{task_number}/{base_filename}_task{task_number}_report_{evaluationtype}.txt"
        with open(report_filename, 'w') as f:
            f.write(report)
        print(f"Report saved to {report_filename}")

    def match_data_by_id(self, predictions, true_data):
        # Keep only predictions whose ID appears in the gold data, aligned in the same order.
        true_data_dict = {item['ID']: item for item in true_data}
        matched_predictions = []
        matched_true_data = []
        for pred in predictions:
            if pred['ID'] in true_data_dict:
                matched_predictions.append(pred)
                matched_true_data.append(true_data_dict[pred['ID']])
        return matched_predictions, matched_true_data

    def task_1_goldlabel(self, predictions, true_data):
        pred_labels = [item['SDG'] for item in predictions]
        true_labels = [item['SDG'] for item in true_data]
        report = classification_report(true_labels, pred_labels, output_dict=False, zero_division=0)
        print("Task 1 Goldlabel Evaluation:\n", report)
        return report

    def task_1_secondary(self, predictions, true_data):
        # Lenient evaluation: a prediction also counts as correct if it matches
        # any of the annotated secondary SDGs ('SDGS') of the gold record.
        for i, item in enumerate(predictions):
            if item['SDG'] in true_data[i]['SDGS']:
                true_data[i]['SDG'] = item['SDG']
        pred_labels = [item['SDG'] for item in predictions]
        true_labels = [item['SDG'] for item in true_data]
        report = classification_report(true_labels, pred_labels, output_dict=False, zero_division=0)
        print("Task 1 Secondary Evaluation:\n", report)
        return report

    def evaluate_task_1(self, predictions, true_data, prediction_file_path):
        predictions, true_data = self.match_data_by_id(predictions, true_data)
        report = self.task_1_goldlabel(predictions, true_data)
        self.save_report(report, 1, prediction_file_path, "goldlabel")
        report = self.task_1_secondary(predictions, true_data)
        self.save_report(report, 1, prediction_file_path, "secondary")

    def evaluate_task_2_goldlabel(self, predictions, true_data):
        lb = LabelBinarizer()
        true_main_targets = lb.fit_transform([item['TARGET'] for item in true_data])
        pred_main_targets = lb.transform([item['TARGET'] for item in predictions])
        main_target_report = classification_report(true_main_targets, pred_main_targets, target_names=lb.classes_, zero_division=0)
        print("Task 2 Goldlabel - Main Target Evaluation\n", main_target_report)
        return main_target_report

    def evaluate_task_2_secondary(self, predictions, true_data):
        mlb = MultiLabelBinarizer()
        true_secondary_targets = mlb.fit_transform([item['TARGETS'] for item in true_data])
        pred_secondary_targets = mlb.transform([item['TARGETS'] for item in predictions])
        secondary_target_report = classification_report(true_secondary_targets, pred_secondary_targets, target_names=mlb.classes_, zero_division=0)
        print("Task 2 Secondary - Secondary Targets Evaluation\n", secondary_target_report)
        return secondary_target_report

    def evaluate_task_2(self, predictions, true_data, prediction_file_path):
        predictions, true_data = self.match_data_by_id(predictions, true_data)
        # Goldlabel Evaluation
        main_target_report = self.evaluate_task_2_goldlabel(predictions, true_data)
        self.save_report(main_target_report, 2, prediction_file_path, "goldlabel")

        # Secondary Evaluation
        secondary_target_report = self.evaluate_task_2_secondary(predictions, true_data)
        self.save_report(secondary_target_report, 2, prediction_file_path, "secondary")

    def evaluate(self, task_number, prediction_file_path):
        original_data_path = f"{self.test_data_folder}/testdata_results.jsonl"
        predictions = self.load_data(prediction_file_path)
        true_data = self.load_data(original_data_path)

        if task_number == 1:
            self.evaluate_task_1(predictions, true_data, prediction_file_path)
        elif task_number == 2:
            self.evaluate_task_2(predictions, true_data, prediction_file_path)
        else:
            print("Invalid task number. Please specify 1 or 2.")

    def evaluate_all_participants(self):
        for folder in os.listdir(self.submission_folder):
            folder_path = os.path.join(self.submission_folder, folder)
            for file in os.listdir(folder_path):
                if file.endswith('.json'):
                    file = self.convert_to_jsonl(file, folder_path)
                if file.endswith('.jsonl'):
                    print(f"################## Evaluating {file} ##################")
                    # The task number is taken from the file name, e.g. "..._task1.jsonl".
                    regex = re.compile(r'task\d+')
                    full_task = regex.findall(file.lower())[0]
                    task_number = int(full_task[-1])
                    self.evaluate(task_number, f"{self.submission_folder}/{folder}/{file}")

    def convert_to_jsonl(self, file, folder_path):
        # Convert a .json submission (a list of records) into .jsonl, one record per line.
        with open(f"{folder_path}/{file}", 'r', encoding='utf-8') as f:
            data = json.load(f)
        with open(f"{folder_path}/{os.path.splitext(file)[0]}.jsonl", 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item) + '\n')
        file = os.path.splitext(file)[0] + '.jsonl'
        return file


if __name__ == '__main__':
    evaluator = Evaluator()
    evaluator.evaluate_all_participants()
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.29      0.41        83
           1       1.00      0.67      0.80         3
           2       0.67      0.50      0.57         4
           3       0.07      0.33      0.11         3
           4       0.67      0.33      0.44         6
           5       0.44      1.00      0.62         4
           6       0.60      0.75      0.67         4
           7       0.29      0.67      0.40         3
           8       0.09      0.40      0.14         5
           9       0.50      0.60      0.55         5
          10       0.30      0.75      0.43         4
          11       0.50      0.50      0.50         4
          12       0.43      0.50      0.46         6
          13       0.25      1.00      0.40         2
          14       1.00      0.20      0.33         5
          15       0.44      0.80      0.57         5
          16       0.10      0.33      0.15         3
          17       0.00      0.00      0.00         7

    accuracy                           0.39       156
   macro avg       0.45      0.53      0.42       156
weighted avg       0.58      0.39      0.41       156
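As a quick sanity check on the aggregate rows: the macro average is the unweighted mean of the per-class scores, while the weighted average weights each class by its support. Recomputing both from the rounded per-class precisions of the report above reproduces the printed values. This is a standalone check, not part of the evaluation script; small rounding differences are expected.

# Recompute the "macro avg" and "weighted avg" precision of the report above
# from its rounded per-class values (classes 0..17).
precisions = [0.71, 1.00, 0.67, 0.07, 0.67, 0.44, 0.60, 0.29, 0.09,
              0.50, 0.30, 0.50, 0.43, 0.25, 1.00, 0.44, 0.10, 0.00]
supports   = [83, 3, 4, 3, 6, 4, 4, 3, 5, 5, 4, 4, 6, 2, 5, 5, 3, 7]

macro_precision = sum(precisions) / len(precisions)
weighted_precision = sum(p * s for p, s in zip(precisions, supports)) / sum(supports)

print(round(macro_precision, 2))     # ~0.45, matching the "macro avg" row
print(round(weighted_precision, 2))  # ~0.58, matching the "weighted avg" row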
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.29      0.41        83
           1       1.00      0.67      0.80         3
           2       1.00      0.60      0.75         5
           3       0.13      0.50      0.21         4
           4       0.67      0.33      0.44         6
           5       0.44      1.00      0.62         4
           6       0.60      0.75      0.67         4
           7       0.43      0.75      0.55         4
           8       0.09      0.40      0.14         5
           9       0.50      1.00      0.67         3
          10       0.30      0.75      0.43         4
          11       0.50      0.50      0.50         4
          12       0.43      0.60      0.50         5
          13       0.50      1.00      0.67         4
          14       1.00      0.20      0.33         5
          15       0.44      0.80      0.57         5
          16       0.10      0.33      0.15         3
          17       0.00      0.00      0.00         5

    accuracy                           0.42       156
   macro avg       0.49      0.58      0.47       156
weighted avg       0.60      0.42      0.43       156
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.29      0.41        83
           1       1.00      0.67      0.80         3
           2       0.67      0.50      0.57         4
           3       0.12      0.67      0.20         3
           4       0.33      0.17      0.22         6
           5       0.40      1.00      0.57         4
           6       0.50      0.50      0.50         4
           7       0.25      0.67      0.36         3
           8       0.09      0.40      0.15         5
           9       0.33      0.40      0.36         5
          10       0.38      0.75      0.50         4
          11       0.67      0.50      0.57         4
          12       0.33      0.50      0.40         6
          13       0.25      1.00      0.40         2
          14       0.67      0.40      0.50         5
          15       0.50      0.80      0.62         5
          16       0.12      0.33      0.18         3
          17       0.00      0.00      0.00         7

    accuracy                           0.38       156
   macro avg       0.41      0.53      0.41       156
weighted avg       0.55      0.38      0.40       156
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.29      0.41        83
           1       1.00      0.67      0.80         3
           2       1.00      0.60      0.75         5
           3       0.18      0.75      0.29         4
           4       0.33      0.17      0.22         6
           5       0.40      1.00      0.57         4
           6       0.50      0.50      0.50         4
           7       0.38      0.75      0.50         4
           8       0.09      0.40      0.15         5
           9       0.33      0.67      0.44         3
          10       0.38      0.75      0.50         4
          11       0.67      0.50      0.57         4
          12       0.33      0.60      0.43         5
          13       0.50      1.00      0.67         4
          14       0.67      0.40      0.50         5
          15       0.50      0.80      0.62         5
          16       0.12      0.33      0.18         3
          17       0.00      0.00      0.00         5

    accuracy                           0.42       156
   macro avg       0.45      0.57      0.45       156
weighted avg       0.58      0.42      0.42       156
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.70      0.28      0.40        83
           1       1.00      0.67      0.80         3
           2       0.67      0.50      0.57         4
           3       0.12      0.67      0.20         3
           4       0.33      0.17      0.22         6
           5       0.44      1.00      0.62         4
           6       0.50      0.50      0.50         4
           7       0.22      0.67      0.33         3
           8       0.09      0.40      0.15         5
           9       0.33      0.40      0.36         5
          10       0.33      0.75      0.46         4
          11       0.67      0.50      0.57         4
          12       0.38      0.50      0.43         6
          13       0.25      1.00      0.40         2
          14       0.67      0.40      0.50         5
          15       0.50      0.80      0.62         5
          16       0.11      0.33      0.17         3
          17       0.00      0.00      0.00         7

    accuracy                           0.38       156
   macro avg       0.41      0.53      0.41       156
weighted avg       0.55      0.38      0.39       156
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.70      0.28      0.40        83
           1       1.00      0.67      0.80         3
           2       1.00      0.60      0.75         5
           3       0.18      0.75      0.29         4
           4       0.33      0.17      0.22         6
           5       0.44      1.00      0.62         4
           6       0.50      0.50      0.50         4
           7       0.33      0.75      0.46         4
           8       0.09      0.40      0.15         5
           9       0.33      0.67      0.44         3
          10       0.33      0.75      0.46         4
          11       0.67      0.50      0.57         4
          12       0.38      0.60      0.46         5
          13       0.50      1.00      0.67         4
          14       0.67      0.40      0.50         5
          15       0.50      0.80      0.62         5
          16       0.11      0.33      0.17         3
          17       0.00      0.00      0.00         5

    accuracy                           0.41       156
   macro avg       0.45      0.56      0.45       156
weighted avg       0.57      0.41      0.42       156
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.77      0.48      0.59        83
           1       0.67      0.67      0.67         3
           2       0.80      1.00      0.89         4
           3       0.12      0.33      0.18         3
           4       0.50      0.17      0.25         6
           5       0.57      1.00      0.73         4
           6       1.00      0.75      0.86         4
           7       0.50      1.00      0.67         3
           8       0.04      0.20      0.06         5
           9       0.43      0.60      0.50         5
          10       0.27      0.75      0.40         4
          11       1.00      0.50      0.67         4
          12       0.75      0.50      0.60         6
          13       0.22      1.00      0.36         2
          14       1.00      0.80      0.89         5
          15       0.83      1.00      0.91         5
          16       0.00      0.00      0.00         3
          17       0.00      0.00      0.00         7

    accuracy                           0.52       156
   macro avg       0.53      0.60      0.51       156
weighted avg       0.65      0.52      0.55       156
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.77      0.48      0.59        83
           1       0.67      0.67      0.67         3
           2       1.00      1.00      1.00         5
           3       0.12      0.33      0.18         3
           4       0.50      0.17      0.25         6
           5       0.57      1.00      0.73         4
           6       1.00      0.75      0.86         4
           7       0.50      1.00      0.67         3
           8       0.04      0.20      0.06         5
           9       0.57      0.80      0.67         5
          10       0.27      0.75      0.40         4
          11       1.00      0.50      0.67         4
          12       0.75      0.60      0.67         5
          13       0.56      1.00      0.71         5
          14       1.00      0.80      0.89         5
          15       0.83      1.00      0.91         5
          16       0.00      0.00      0.00         3
          17       0.00      0.00      0.00         4

    accuracy                           0.55       156
   macro avg       0.56      0.61      0.55       156
weighted avg       0.68      0.55      0.58       156