Auto metric (#54)
* AutoMetric structure completed.

* Version updated to 2.1.0 (from 2.0.0).

* Test cases added for sequence-classification metrics (accuracy, precision, recall, f1).

* Seqeval added to metric requirements in setup.py.

* Docstrings revised.

* Multiple predictions are now supported for sequence-classification metrics and handled for the sequence-labeling metric `seqeval` (see the sketch below).
  - Associated tests are added.
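As a rough sketch of what the new interface enables (the metric names, inputs, and `reduce_fn` value below are illustrative assumptions, not taken verbatim from this commit): metrics can now be configured as plain dicts with `compute_kwargs`, and multiple predictions per sample are collapsed via a `reduce_fn`.

```python
# Illustrative sketch only: metric names, inputs, and reduce_fn are assumptions.
from jury import Jury

scorer = Jury(
    metrics=[
        {"metric_name": "bleu", "compute_kwargs": {"max_order": 2}},
        {"metric_name": "meteor"},
    ]
)

predictions = [["the cat sat on the mat", "a cat sat on the mat"]]  # multiple predictions per item
references = [["the cat sat on the mat"]]

# reduce_fn collapses per-prediction scores into one score per item (e.g. take the max)
scores = scorer(predictions=predictions, references=references, reduce_fn="max")
print(scores)
```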
devrimcavusoglu authored Oct 25, 2021
1 parent bca6ca4 commit 8801eb1
Showing 86 changed files with 2,745 additions and 591 deletions.
23 changes: 12 additions & 11 deletions README.md
@@ -72,7 +72,7 @@ You can directly import metrics from `jury.metrics` as classes, and then instant
```diff
 from jury.metrics import Bleu
 
-bleu = Bleu()
+bleu = Bleu.construct()
 score = bleu.compute(predictions=predictions, references=references)
```

@@ -81,14 +81,14 @@ The additional parameters can either be specified on `compute()`
```diff
 from jury.metrics import Bleu
 
-bleu = Bleu()
+bleu = Bleu.construct()
 score = bleu.compute(predictions=predictions, references=references, max_order=4)
```

, or alternatively on instantiation

```diff
-bleu = Bleu(params={"max_order": 1})
+bleu = Bleu._construct(compute_kwargs={"max_order": 1})
```

Note that you can seamlessly access both `jury` and `datasets` metrics through `jury.load_metric`.
@@ -97,7 +97,7 @@ Note that you can seamlessly access both `jury` and `datasets` metrics through `
import jury

bleu = jury.load_metric("bleu")
bleu_1 = jury.load_metric("bleu", resulting_name="bleu_1", params={"max_order": 1})
bleu_1 = jury.load_metric("bleu", resulting_name="bleu_1", compute_kwargs={"max_order": 1})
# metrics not available in `jury` but in `datasets`
wer = jury.load_metric("wer") # It falls back to `datasets` package with a warning
```
@@ -139,20 +139,21 @@ Jury itself uses `datasets.Metric` as a base class to drive its own base class a
Either base class can be used for a custom metric; however, we strongly recommend using `jury.metrics.Metric`, as it has several advantages, such as supporting computations for the input types above and unifying the input type.

```diff
-from jury.metrics import Metric
-class CustomMetric(Metric):
-    def _compute_single_pred_single_ref(
-        self, predictions: Collator, references: Collator, reduce_fn: Callable = None, **kwargs
+from jury.metrics import MetricForTask
+
+class CustomMetric(MetricForTask):
+    def _compute_single_pred_single_ref(
+        self, predictions, references, reduce_fn = None, **kwargs
     ):
         raise NotImplementedError
 
     def _compute_single_pred_multi_ref(
-        self, predictions: Collator, references: Collator, reduce_fn: Callable, **kwargs
+        self, predictions, references, reduce_fn = None, **kwargs
     ):
         raise NotImplementedError
 
-    def _compute_multi_pred_multi_ref(self, predictions: Collator, references: Collator, reduce_fn: Callable, **kwargs
+    def _compute_multi_pred_multi_ref(
+        self, predictions, references, reduce_fn = None, **kwargs
     ):
         raise NotImplementedError
```
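For illustration only, a toy exact-match metric filled into the skeleton above might look like the following; the return format, the `reduce_fn` handling, and any additional abstract hooks required by the base class are assumptions rather than part of this commit.

```python
# Toy sketch of a concrete subclass; return format and reduce_fn handling are assumed.
# A real subclass will likely also need the usual datasets.Metric hooks (e.g. _info()).
from jury.metrics import MetricForTask


class ExactMatch(MetricForTask):
    def _compute_single_pred_single_ref(self, predictions, references, reduce_fn=None, **kwargs):
        matches = [int(pred == ref) for pred, ref in zip(predictions, references)]
        return {"exact_match": sum(matches) / len(matches)}

    def _compute_single_pred_multi_ref(self, predictions, references, reduce_fn=None, **kwargs):
        # A prediction counts as correct if it matches any of its references.
        matches = [int(pred in refs) for pred, refs in zip(predictions, references)]
        return {"exact_match": sum(matches) / len(matches)}

    def _compute_multi_pred_multi_ref(self, predictions, references, reduce_fn=None, **kwargs):
        # Score each candidate prediction against the references, then reduce over candidates.
        reduce_fn = reduce_fn or max
        matches = [
            reduce_fn([int(pred in refs) for pred in preds])
            for preds, refs in zip(predictions, references)
        ]
        return {"exact_match": sum(matches) / len(matches)}
```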
9 changes: 2 additions & 7 deletions jury/__init__.py
```diff
@@ -1,9 +1,4 @@
-import os
-
 from jury.core import Jury
-from jury.metrics import load_metric
-
-SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
-PROJECT_ROOT = os.path.dirname(SOURCE_DIR)
+from jury.metrics import AutoMetric, list_metrics, load_metric
 
-__version__ = "2.0.0"
+__version__ = "2.1.0"
```
13 changes: 7 additions & 6 deletions jury/collator.py
```diff
@@ -22,7 +22,7 @@ def collapse(self):
         return Collator(np.ravel(self).tolist(), keep=True)
 
     def nested(self):
-        return Collator(self.from_list_of_str(self))
+        return Collator(self.from_list(self))
 
     def reshape(self, *args):
         _seq = np.array(self, dtype=object)
```
```diff
@@ -53,16 +53,17 @@ def _construct(self, sequence: Union[str, List[str], List[List[str]]], keep: boo
         if keep:
             return sequence
 
-        _type = NestedSingleType.get_type(sequence)
-        if _type == "str":
+        _type_primary = NestedSingleType.get_type(sequence, order=0)
+        _type_secondary = NestedSingleType.get_type(sequence, order=1)
+        if _type_primary in ["str", "dict"]:
             sequence = self.from_str(sequence)
-        elif _type == "list<str>" or _type == "list<dict>":
-            sequence = self.from_list_of_str(sequence)
+        elif _type_primary == "list" and _type_secondary != "list":
+            sequence = self.from_list(sequence)
 
         return sequence
 
     @staticmethod
-    def from_list_of_str(seq: List[str]):
+    def from_list(seq: List[str]):
         return [[item] for item in seq]
 
     @classmethod
```
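In effect, `Collator._construct` now normalizes non-nested inputs into the nested form the metrics expect. A behavioural sketch follows (the exact output of `from_str` is not shown in this diff, so only the list cases are illustrated):

```python
# Behavioural sketch of the new type handling in Collator._construct (illustrative values).
from jury.collator import Collator

Collator(["a", "b"])             # list of str: wrapped item-wise via from_list -> [["a"], ["b"]]
Collator([["a1", "a2"], ["b"]])  # already nested (list of lists): left unchanged
```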
44 changes: 35 additions & 9 deletions jury/core.py
```diff
@@ -1,10 +1,11 @@
 from concurrent.futures import ProcessPoolExecutor
-from typing import Any, Callable, Dict, List, Mapping, Optional, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 from jury.collator import Collator
 from jury.definitions import DEFAULT_METRICS
-from jury.metrics import Metric, load_metric
-from jury.utils import replace, set_env
+from jury.metrics import load_metric
+from jury.metrics._core import EvaluationInstance, Metric
+from jury.utils import pop_item_from_dict, replace, set_env
 
 MetricParam = Union[str, Metric, Dict[str, Any]]
 
```
```diff
@@ -46,11 +47,14 @@ def __init__(
         self.metrics = self._load_metrics(metrics)
         self._concurrent = run_concurrent
 
+        # Sanity check
+        self._validate_metrics()
+
     def __call__(
         self,
         *,
-        predictions: Union[List[str], List[List[str]]] = None,
-        references: Union[List[str], List[List[str]]] = None,
+        predictions: EvaluationInstance = None,
+        references: EvaluationInstance = None,
         reduce_fn: Optional[Union[str, Callable]] = None,
     ) -> Dict[str, float]:
         """Restricts positional arguments to prevent potential inconsistency between predictions and references."""
```
```diff
@@ -89,10 +93,20 @@ def _load_multiple_metrics(self, metrics: Union[List[str], List[Dict[str, Any]],
                 metrics = replace(metrics, load_metric(metric_name.lower()), i)
             elif isinstance(metric_param, dict):
                 metric_name = metric_param.pop("metric_name")  # must be given
-                resulting_name = metric_param.pop("resulting_name") if "resulting_name" in metric_param else None
-                params = metric_param
+                task = pop_item_from_dict(metric_param, "task")
+                resulting_name = pop_item_from_dict(metric_param, "resulting_name")
+                compute_kwargs = pop_item_from_dict(metric_param, "compute_kwargs")
+                kwargs = metric_param
                 metrics = replace(
-                    metrics, load_metric(metric_name=metric_name, resulting_name=resulting_name, params=params), i
+                    metrics,
+                    load_metric(
+                        metric_name=metric_name,
+                        task=task,
+                        resulting_name=resulting_name,
+                        compute_kwargs=compute_kwargs,
+                        **kwargs,
+                    ),
+                    i,
                 )
             elif isinstance(metric_param, Metric):
                 continue
```
```diff
@@ -101,7 +115,8 @@ def _load_metrics(self, metrics: Union[MetricParam, List[MetricParam]]) -> List[Metric]:
     def _load_metrics(self, metrics: Union[MetricParam, List[MetricParam]]) -> List[Metric]:
         if metrics is None:
             metrics = DEFAULT_METRICS
-        elif isinstance(metrics, (str, Metric)):
+
+        if isinstance(metrics, (str, Metric)):
             metrics = self._load_single_metric(metrics)
         elif isinstance(metrics, list):
             metrics = self._load_multiple_metrics(metrics)
```
```diff
@@ -133,9 +148,20 @@ def _prepare_concurrent_inputs(self, predictions, references, reduce_fn):
             inputs.append((metric, predictions, references, reduce_fn))
         return inputs
 
+    def _validate_metrics(self):
+        metrics = self.metrics
+        if all([isinstance(metric, Metric) for metric in metrics]):
+            task = metrics[0].task
+            if not all([metric.task == task for metric in metrics]):
+                raise ValueError(
+                    "Given metrics are not suitable to be used together, metrics must be of same the task."
+                )
+        return True
+
     def add_metric(self, metric_name: str, resulting_name: str = None, params: Dict = None) -> None:
         metric = load_metric(metric_name, resulting_name=resulting_name, params=params)
         self.metrics.append(metric)
+        self._validate_metrics()
 
     def remove_metric(self, resulting_name: str, error: bool = True) -> None:
         for i, metric in enumerate(self.metrics):
```
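With `_validate_metrics` in place, a `Jury` instance now rejects metric sets that mix tasks, both at construction and in `add_metric`. A hedged sketch of the expected behaviour (the task name below is an assumption):

```python
# Sketch of the expected validation behaviour; the task name is assumed, not taken from the diff.
from jury import Jury

try:
    Jury(metrics=[
        {"metric_name": "bleu"},  # a generation metric
        {"metric_name": "accuracy", "task": "sequence-classification"},  # assumed task name
    ])
except ValueError as err:
    print(err)  # metrics must all belong to the same task
```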
14 changes: 6 additions & 8 deletions jury/definitions.py
```diff
@@ -1,10 +1,8 @@
-from jury.metrics import Bleu, Meteor, Rouge
-
 DEFAULT_METRICS = [
-    Bleu(params={"max_order": 1}),
-    Bleu(params={"max_order": 2}),
-    Bleu(params={"max_order": 3}),
-    Bleu(params={"max_order": 4}),
-    Meteor(),
-    Rouge(),
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 1}},
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 2}},
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 3}},
+    {"metric_name": "bleu", "compute_kwargs": {"max_order": 4}},
+    {"metric_name": "meteor"},
+    {"metric_name": "rouge"},
 ]
```
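Since the defaults are now plain dicts rather than metric instances, they are only resolved through `load_metric` when a `Jury` scorer is built without explicit metrics. A brief sketch (assuming `metrics` defaults to `None`; input values are illustrative):

```python
# Sketch: with no metrics given, Jury falls back to DEFAULT_METRICS above
# (BLEU-1..4, METEOR, ROUGE). Inputs are illustrative.
from jury import Jury

scorer = Jury()  # assumes metrics defaults to None -> DEFAULT_METRICS
scores = scorer(
    predictions=["the cat sat on the mat"],
    references=["the cat is sitting on the mat"],
)
print(scores)
```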
6 changes: 4 additions & 2 deletions jury/metrics/__init__.py
```diff
@@ -1,4 +1,4 @@
-from jury.metrics._base import Metric, load_metric
+from jury.metrics._core import AutoMetric, MetricForTask, list_metrics, load_metric
 from jury.metrics.accuracy import Accuracy
 from jury.metrics.bertscore import Bertscore
 from jury.metrics.bleu import Bleu
```
```diff
@@ -12,14 +12,16 @@
 
 __all__ = [
     "Accuracy",
+    "AutoMetric",
     "Bertscore",
     "Bleu",
     "F1",
     "Meteor",
     "Metric",
     "Precision",
     "Recall",
     "Rouge",
     "Sacrebleu",
     "Squad",
     "load_metric",
+    "list_metrics",
 ]
```
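With `AutoMetric` and `list_metrics` now exported here (and re-exported at package level in `jury/__init__.py` above), metric discovery might look like the following sketch; the exact return value of `list_metrics()` is an assumption.

```python
# Sketch of the new discovery helpers; the printed output is assumed, not verified.
import jury

print(jury.list_metrics())         # e.g. ['accuracy', 'bertscore', 'bleu', 'f1', ...]
squad = jury.load_metric("squad")  # loads the bundled Squad metric
```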
(Remaining changed files not shown.)
