Auto metric (#54)
* AutoMetric structure completed.

* Version updated to 2.1.0 (from 2.0.0).

* Test cases added for sequence-classification metrics (accuracy, precision, recall, f1).

* Seqeval added to metric requirements in setup.py.

* Docstrings revised.

* Multiple predictions are now supported for sequence-classification metrics and handled for the sequence-labeling metric `seqeval` (see the sketch below).
  - Associated tests are added.
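As a rough sketch of what the new interface enables (the metric names, inputs, and `reduce_fn` value below are illustrative assumptions, not taken verbatim from this commit): metrics can now be configured as plain dicts with `compute_kwargs`, and multiple predictions per sample are collapsed via a `reduce_fn`.

```python
# Illustrative sketch only: metric names, inputs, and reduce_fn are assumptions.
from jury import Jury

scorer = Jury(
    metrics=[
        {"metric_name": "bleu", "compute_kwargs": {"max_order": 2}},
        {"metric_name": "meteor"},
    ]
)

predictions = [["the cat sat on the mat", "a cat sat on the mat"]]  # multiple predictions per item
references = [["the cat sat on the mat"]]

# reduce_fn collapses per-prediction scores into one score per item (e.g. take the max)
scores = scorer(predictions=predictions, references=references, reduce_fn="max")
print(scores)
```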
devrimcavusoglu authored Oct 25, 2021
1 parent bca6ca4 commit 8801eb1
Showing 86 changed files with 2,745 additions and 591 deletions.
23 changes: 12 additions & 11 deletions README.md
@@ -72,7 +72,7 @@ You can directly import metrics from `jury.metrics` as classes, and then instant
```diff
 from jury.metrics import Bleu
 
-bleu = Bleu()
+bleu = Bleu.construct()
 score = bleu.compute(predictions=predictions, references=references)
```

@@ -81,14 +81,14 @@ The additional parameters can either be specified on `compute()`
```diff
 from jury.metrics import Bleu
 
-bleu = Bleu()
+bleu = Bleu.construct()
 score = bleu.compute(predictions=predictions, references=references, max_order=4)
```

, or alternatively on instantiation

```diff
-bleu = Bleu(params={"max_order": 1})
+bleu = Bleu._construct(compute_kwargs={"max_order": 1})
```

Note that you can seamlessly access both `jury` and `datasets` metrics through `jury.load_metric`.
@@ -97,7 +97,7 @@ Note that you can seamlessly access both `jury` and `datasets` metrics through `
import jury

bleu = jury.load_metric("bleu")
bleu_1 = jury.load_metric("bleu", resulting_name="bleu_1", params={"max_order": 1})
bleu_1 = jury.load_metric("bleu", resulting_name="bleu_1", compute_kwargs={"max_order": 1})
# metrics not available in `jury` but in `datasets`
wer = jury.load_metric("wer") # It falls back to `datasets` package with a warning
```
@@ -139,20 +139,21 @@ Jury itself uses `datasets.Metric` as a base class to drive its own base class a
Either base class can be used for a custom metric; however, we strongly recommend using `jury.metrics.Metric`, as it has several advantages, such as supporting computations for the input types above and unifying the input type.

```diff
-from jury.metrics import Metric
-class CustomMetric(Metric):
-    def _compute_single_pred_single_ref(
-        self, predictions: Collator, references: Collator, reduce_fn: Callable = None, **kwargs
+from jury.metrics import MetricForTask
+
+class CustomMetric(MetricForTask):
+    def _compute_single_pred_single_ref(
+        self, predictions, references, reduce_fn = None, **kwargs
     ):
         raise NotImplementedError
 
     def _compute_single_pred_multi_ref(
-        self, predictions: Collator, references: Collator, reduce_fn: Callable, **kwargs
+        self, predictions, references, reduce_fn = None, **kwargs
     ):
         raise NotImplementedError
 
-    def _compute_multi_pred_multi_ref(self, predictions: Collator, references: Collator, reduce_fn: Callable, **kwargs
+    def _compute_multi_pred_multi_ref(
+        self, predictions, references, reduce_fn = None, **kwargs
     ):
         raise NotImplementedError
```
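For illustration only, a toy exact-match metric filled into the skeleton above might look like the following; the return format, the `reduce_fn` handling, and any additional abstract hooks required by the base class are assumptions rather than part of this commit.

```python
# Toy sketch of a concrete subclass; return format and reduce_fn handling are assumed.
# A real subclass will likely also need the usual datasets.Metric hooks (e.g. _info()).
from jury.metrics import MetricForTask


class ExactMatch(MetricForTask):
    def _compute_single_pred_single_ref(self, predictions, references, reduce_fn=None, **kwargs):
        matches = [int(pred == ref) for pred, ref in zip(predictions, references)]
        return {"exact_match": sum(matches) / len(matches)}

    def _compute_single_pred_multi_ref(self, predictions, references, reduce_fn=None, **kwargs):
        # A prediction counts as correct if it matches any of its references.
        matches = [int(pred in refs) for pred, refs in zip(predictions, references)]
        return {"exact_match": sum(matches) / len(matches)}

    def _compute_multi_pred_multi_ref(self, predictions, references, reduce_fn=None, **kwargs):
        # Score each candidate prediction against the references, then reduce over candidates.
        reduce_fn = reduce_fn or max
        matches = [
            reduce_fn([int(pred in refs) for pred in preds])
            for preds, refs in zip(predictions, references)
        ]
        return {"exact_match": sum(matches) / len(matches)}
```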
9 changes: 2 additions & 7 deletions jury/__init__.py
```diff
@@ -1,9 +1,4 @@
-import os
-
 from jury.core import Jury
-from jury.metrics import load_metric
-
-SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
-PROJECT_ROOT = os.path.dirname(SOURCE_DIR)
+from jury.metrics import AutoMetric, list_metrics, load_metric
 
-__version__ = "2.0.0"
+__version__ = "2.1.0"
```
13 changes: 7 additions & 6 deletions jury/collator.py
```diff
@@ -22,7 +22,7 @@ def collapse(self):
         return Collator(np.ravel(self).tolist(), keep=True)
 
     def nested(self):
-        return Collator(self.from_list_of_str(self))
+        return Collator(self.from_list(self))
 
     def reshape(self, *args):
         _seq = np.array(self, dtype=object)
```
```diff
@@ -53,16 +53,17 @@ def _construct(self, sequence: Union[str, List[str], List[List[str]]], keep: boo
         if keep:
             return sequence
 
-        _type = NestedSingleType.get_type(sequence)
-        if _type == "str":
+        _type_primary = NestedSingleType.get_type(sequence, order=0)
+        _type_secondary = NestedSingleType.get_type(sequence, order=1)
+        if _type_primary in ["str", "dict"]:
             sequence = self.from_str(sequence)
-        elif _type == "list<str>" or _type == "list<dict>":
-            sequence = self.from_list_of_str(sequence)
+        elif _type_primary == "list" and _type_secondary != "list":
+            sequence = self.from_list(sequence)
 
         return sequence
 
     @staticmethod
-    def from_list_of_str(seq: List[str]):
+    def from_list(seq: List[str]):
         return [[item] for item in seq]
 
     @classmethod
```
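In effect, `Collator._construct` now normalizes non-nested inputs into the nested form the metrics expect. A behavioural sketch follows (the exact output of `from_str` is not shown in this diff, so only the list cases are illustrated):

```python
# Behavioural sketch of the new type handling in Collator._construct (illustrative values).
from jury.collator import Collator

Collator(["a", "b"])             # list of str: wrapped item-wise via from_list -> [["a"], ["b"]]
Collator([["a1", "a2"], ["b"]])  # already nested (list of lists): left unchanged
```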
44 changes: 35 additions & 9 deletions jury/core.py
```diff
@@ -1,10 +1,11 @@
 from concurrent.futures import ProcessPoolExecutor
-from typing import Any, Callable, Dict, List, Mapping, Optional, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 from jury.collator import Collator
 from jury.definitions import DEFAULT_METRICS
-from jury.metrics import Metric, load_metric
-from jury.utils import replace, set_env
+from jury.metrics import load_metric
+from jury.metrics._core import EvaluationInstance, Metric
+from jury.utils import pop_item_from_dict, replace, set_env
 
 MetricParam = Union[str, Metric, Dict[str, Any]]
 
```
```diff
@@ -46,11 +47,14 @@ def __init__(
         self.metrics = self._load_metrics(metrics)
         self._concurrent = run_concurrent
 
+        # Sanity check
+        self._validate_metrics()
+
     def __call__(
         self,
         *,
-        predictions: Union[List[str], List[List[str]]] = None,
-        references: Union[List[str], List[List[str]]] = None,
+        predictions: EvaluationInstance = None,
+        references: EvaluationInstance = None,
         reduce_fn: Optional[Union[str, Callable]] = None,
     ) -> Dict[str, float]:
         """Restricts positional arguments to prevent potential inconsistency between predictions and references."""
```
```diff
@@ -89,10 +93,20 @@ def _load_multiple_metrics(self, metrics: Union[List[str], List[Dict[str, Any]],
                 metrics = replace(metrics, load_metric(metric_name.lower()), i)
             elif isinstance(metric_param, dict):
                 metric_name = metric_param.pop("metric_name")  # must be given
-                resulting_name = metric_param.pop("resulting_name") if "resulting_name" in metric_param else None
-                params = metric_param
+                task = pop_item_from_dict(metric_param, "task")
+                resulting_name = pop_item_from_dict(metric_param, "resulting_name")
+                compute_kwargs = pop_item_from_dict(metric_param, "compute_kwargs")
+                kwargs = metric_param
                 metrics = replace(
-                    metrics, load_metric(metric_name=metric_name, resulting_name=resulting_name, params=params), i
+                    metrics,
+                    load_metric(
+                        metric_name=metric_name,
+                        task=task,
+                        resulting_name=resulting_name,
+                        compute_kwargs=compute_kwargs,
+                        **kwargs,
+                    ),
+                    i,
                 )
             elif isinstance(metric_param, Metric):
                 continue
```
```diff
@@ -101,7 +115,8 @@ def _load_metrics(self, metrics: Union[MetricParam, List[MetricParam]]) -> List[Metric]:
     def _load_metrics(self, metrics: Union[MetricParam, List[MetricParam]]) -> List[Metric]:
         if metrics is None:
             metrics = DEFAULT_METRICS
-        elif isinstance(metrics, (str, Metric)):
+
+        if isinstance(metrics, (str, Metric)):
             metrics = self._load_single_metric(metrics)
         elif isinstance(metrics, list):
             metrics = self._load_multiple_metrics(metrics)
```
```diff
@@ -133,9 +148,20 @@ def _prepare_concurrent_inputs(self, predictions, references, reduce_fn):
             inputs.append((metric, predictions, references, reduce_fn))
         return inputs
 
+    def _validate_metrics(self):
+        metrics = self.metrics
+        if all([isinstance(metric, Metric) for metric in metrics]):
+            task = metrics[0].task
+            if not all([metric.task == task for metric in metrics]):
+                raise ValueError(
+                    "Given metrics are not suitable to be used together, metrics must be of same the task."
+                )
+        return True
+
     def add_metric(self, metric_name: str, resulting_name: str = None, params: Dict = None) -> None:
         metric = load_metric(metric_name, resulting_name=resulting_name, params=params)
         self.metrics.append(metric)
+        self._validate_metrics()
 
     def remove_metric(self, resulting_name: str, error: bool = True) -> None:
         for i, metric in enumerate(self.metrics):
```
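With `_validate_metrics` in place, a `Jury` instance now rejects metric sets that mix tasks, both at construction and in `add_metric`. A hedged sketch of the expected behaviour (the task name below is an assumption):

```python
# Sketch of the expected validation behaviour; the task name is assumed, not taken from the diff.
from jury import Jury

try:
    Jury(metrics=[
        {"metric_name": "bleu"},  # a generation metric
        {"metric_name": "accuracy", "task": "sequence-classification"},  # assumed task name
    ])
except ValueError as err:
    print(err)  # metrics must all belong to the same task
```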
14 changes: 6 additions & 8 deletions jury/definitions.py
```diff
@@ -1,10 +1,8 @@
-from jury.metrics import Bleu, Meteor, Rouge
-
 DEFAULT_METRICS = [
-    Bleu(params={"max_order": 1}),
-    Bleu(params={"max_order": 2}),
-    Bleu(params={"max_order": 3}),
-    Bleu(params={"max_order": 4}),
-    Meteor(),
-    Rouge(),
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 1}},
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 2}},
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 3}},
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 4}},
+    {"metric_name": "meteor"},
+    {"metric_name": "rouge"},
 ]
```
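Since the defaults are now plain dicts rather than metric instances, they are only resolved through `load_metric` when a `Jury` scorer is built without explicit metrics. A brief sketch (assuming `metrics` defaults to `None`; input values are illustrative):

```python
# Sketch: with no metrics given, Jury falls back to DEFAULT_METRICS above
# (BLEU-1..4, METEOR, ROUGE). Inputs are illustrative.
from jury import Jury

scorer = Jury()  # assumes metrics defaults to None -> DEFAULT_METRICS
scores = scorer(
    predictions=["the cat sat on the mat"],
    references=["the cat is sitting on the mat"],
)
print(scores)
```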
6 changes: 4 additions & 2 deletions jury/metrics/__init__.py
```diff
@@ -1,4 +1,4 @@
-from jury.metrics._base import Metric, load_metric
+from jury.metrics._core import AutoMetric, MetricForTask, list_metrics, load_metric
 from jury.metrics.accuracy import Accuracy
 from jury.metrics.bertscore import Bertscore
 from jury.metrics.bleu import Bleu
```
```diff
@@ -12,14 +12,16 @@
 
 __all__ = [
     "Accuracy",
+    "AutoMetric",
     "Bertscore",
     "Bleu",
     "F1",
     "Meteor",
     "Metric",
     "Precision",
     "Recall",
     "Rouge",
     "Sacrebleu",
     "Squad",
     "load_metric",
+    "list_metrics",
 ]
```
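With `AutoMetric` and `list_metrics` now exported here (and re-exported at package level in `jury/__init__.py` above), metric discovery might look like the following sketch; the exact return value of `list_metrics()` is an assumption.

```python
# Sketch of the new discovery helpers; the printed output is assumed, not verified.
import jury

print(jury.list_metrics())         # e.g. ['accuracy', 'bertscore', 'bleu', 'f1', ...]
squad = jury.load_metric("squad")  # loads the bundled Squad metric
```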
(Remaining changed files not shown.)
