feat: LEAP-1340: Add label_order_matters option to naive comparison for timelinelabels (#152)
HumanSignal · Aug 21, 2024 · 73bf332 · 73bf332
1 parent 3792573
commit 73bf332
Show file tree

Hide file tree

Showing 2 changed files with 104 additions and 5 deletions.
diff --git a/evalme/classification.py b/evalme/classification.py
@@ -119,9 +119,12 @@ def exact_matching_pairwise(item_gt, item_pred, label_weights=None, per_label=Fa
                                                                   per_label=per_label)
 
 
-def naive(x, y, per_label=False, **kwargs):
+def naive(x, y, per_label=False, label_order_matters=True, **kwargs):
     """
     Naive comparison of annotations
+
+    If label order doesn't matter, we consider y's whole result array to find an exact match for each item from x['result'].
+    This could be made more efficient by sorting the results first, but we don't do that yet.
     """
     # extract results from annotations
     if isinstance(x, dict) and isinstance(y, dict):
@@ -147,8 +150,13 @@ def naive(x, y, per_label=False, **kwargs):
                     for label in labels:
                         # for taxonomy and other non-str labels
                         label = str(label)
-                        if x[i]['value'] == y[i]['value']:
-                            results[label] += 1
+                        y_indexes = list(range(len(y)))
+                        if label_order_matters:
+                            y_indexes = [i]
+                        for y_index in y_indexes:
+                            if x[i]['value'] == y[y_index]['value']:
+                                results[label] += 1
+                                break
                         counts[label] += 1
                 for label in counts:
                     result[label] = results[label] / counts[label]
@@ -157,7 +165,12 @@ def naive(x, y, per_label=False, **kwargs):
         else:
             result = 0
             for i in range(len(x)):
-                if x[i]['value'] == y[i]['value']:
-                    result += 1
+                y_indexes = list(range(len(y)))
+                if label_order_matters:
+                    y_indexes = [i]
+                for y_index in y_indexes:
+                    if x[i]['value'] == y[y_index]['value']:
+                        result += 1
+                        break
             result = result / len(x)
     return result
diff --git a/evalme/tests/test_classification.py b/evalme/tests/test_classification.py
@@ -1,5 +1,6 @@
 import pytest
 
+from copy import deepcopy
 from evalme.classification import ClassificationEvalItem, ChoicesEvalItem, naive, exact_matching_choices
 
 from evalme.metrics import Metrics
@@ -261,6 +262,91 @@ def test_naive_not_matching():
     assert Metrics.apply({}, test_data[0], test_data[1], metric_name='naive') == 0
 
 
+def test_naive_order_doesnt_matter():
+    first = {
+        "type": "timelinelabels",
+        "value": {
+            "ranges": [
+                {
+                    "end": 7,
+                    "start": 1
+                }
+            ],
+            "timelinelabels": [
+                "Dromedary"
+            ]
+        },
+        "to_name": "video",
+        "from_name": "videolabels"
+    }
+    second = {
+        "type": "timelinelabels",
+        "value": {
+            "ranges": [
+            {
+                "end": 6,
+                "start": 1
+            }
+            ],
+            "timelinelabels": [
+                "Bactrian"
+            ]
+        },
+        "to_name": "video",
+        "from_name": "videolabels"
+    }
+
+    test_x = [first, second]
+    test_y = [second, first]
+
+    assert naive(test_x, test_y, label_order_matters=False) == 1.0
+    assert naive(test_x, test_y) == 0.0
+
+
+def test_naive_order_doesnt_matter_partial_agreement():
+    first = {
+        "type": "timelinelabels",
+        "value": {
+            "ranges": [
+                {
+                    "end": 7,
+                    "start": 1
+                }
+            ],
+            "timelinelabels": [
+                "Dromedary"
+            ]
+        },
+        "to_name": "video",
+        "from_name": "videolabels"
+    }
+    first_2 = deepcopy(first)
+    first_2["value"]["ranges"][0]["end"] = 5
+
+    second = {
+        "type": "timelinelabels",
+        "value": {
+            "ranges": [
+            {
+                "end": 6,
+                "start": 1
+            }
+            ],
+            "timelinelabels": [
+                "Bactrian"
+            ]
+        },
+        "to_name": "video",
+        "from_name": "videolabels"
+    }
+
+    test_x = [first, second]
+    test_y = [second, first_2]
+
+    assert naive(test_x, test_y, label_order_matters=False) == 0.5
+    assert naive(test_x, test_y) == 0.0
+
+
 def test_naive_not_matching_per_label():
     test_data = [[
         {