Skip to content

Commit

Permalink
Adding BoomerEngine.compare, addresses INCATools/boomer#334
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall committed Jan 31, 2023
1 parent e8699d7 commit b1e4a9b
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 27 deletions.
20 changes: 14 additions & 6 deletions src/oaklib/parsers/boomer_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,21 @@
)
from oaklib.parsers.parser_base import Parser

re_new_block = re.compile(r"^## (.*)")
re_tag_val = re.compile(r"^(\S+): (.*)")
re_mapping1 = re.compile(r"^- (\[\S+)\s+(\w+)\s+(\S+\))\s+\(most probable\)\s+(\S+)$")
re_mapping2 = re.compile(r"^- (\[\S+)\s+(\w+)\s+(\S+\))\s+(\S+)$")
re_md_link = re.compile(r"^\[(.*)\]\((\S+)\)")

# Translation table from the relation labels used in boomer reports to the
# SKOS mapping predicates used for the resulting mappings.
BOOMER_TO_SKOS = {
    "SiblingOf": SKOS_CLOSE_MATCH,
    "EquivalentTo": SKOS_EXACT_MATCH,
    "ProperSubClassOf": SKOS_BROAD_MATCH,
    "ProperSuperClassOf": SKOS_NARROW_MATCH,
}

# Regex alternation over every known boomer relation label, in table order;
# iterating a dict yields its keys.
BOOMER_PRED_RE = "|".join(BOOMER_TO_SKOS)

# A level-2 markdown heading ("## <title>"); group 1 is the heading text.
re_new_block = re.compile(r"^## (.*)")
# A "Tag: value" line. The tag starts with a word character and runs up to the
# FIRST colon, so values that themselves contain colons (CURIEs, URLs) stay
# intact; a greedy tag would instead split at the last colon on the line.
# Group 1 is the tag, group 2 the (possibly empty) value.
re_tag_val = re.compile(r"^(\w[^:]*):\s*(.*)")
# Bullet line for a mapping flagged "(most probable)". Groups:
#   1: text starting at "[" (left-hand term), 2: one of the boomer relation
#   labels, 3: text ending in ")" (right-hand term), 4: final non-whitespace
#   token (a number such as 0.4 in the test fixture — presumably a probability;
#   confirm against the parser).
re_mapping1 = re.compile(rf"^- (\[.+)\s+({BOOMER_PRED_RE})\s+(.+\))\s+\(most probable\)\s+(\S+)$")
# Same bullet shape without the "(most probable)" marker; same four groups.
# NOTE(review): because group 3 is greedy this pattern can also match a
# "(most probable)" line, so it should only be tried after re_mapping1 fails.
re_mapping2 = re.compile(rf"^- (\[.+)\s+({BOOMER_PRED_RE})\s+(.+\))\s+(\S+)$")
# Markdown link at the start of a string: "[label](target)".
# Group 1 is the label, group 2 the whitespace-free target.
re_md_link = re.compile(r"^\[(.*)\]\((\S+)\)")


@dataclass
class BoomerParser(Parser):
Expand Down Expand Up @@ -62,6 +64,10 @@ def parse(self, file: TextIO) -> Iterator[MappingCluster]:
cluster.posterior_probability = float(val)
elif tag == "Confidence":
cluster.confidence = float(val)
elif tag.startswith("Subsequent"):
pass
else:
logging.warning(f"Unparsed tag {tag} in {line}")
continue
m = re_mapping1.match(line)
if not m:
Expand All @@ -85,6 +91,8 @@ def parse(self, file: TextIO) -> Iterator[MappingCluster]:
)
cluster.resolved_mappings.append(mapping)
continue
if line:
logging.warning(f"Cannot parse: {line}")
if not cluster:
logging.warning("No clusters in file")
return
Expand Down
131 changes: 112 additions & 19 deletions src/oaklib/utilities/mapping/boomer_utils.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
import logging
from collections import defaultdict
from copy import copy
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Iterator, List, Optional, TextIO, Union, Dict, Tuple
from typing import Dict, Iterator, List, Optional, TextIO, Tuple, Union

import click
import sssom_schema as sssom
from oaklib import get_implementation_from_shorthand
from oaklib.interfaces import MappingProviderInterface
from oaklib.io.streaming_csv_writer import StreamingCsvWriter
from oaklib.types import CURIE, PRED_CURIE
from sssom.constants import SEMAPV
from sssom.constants import SEMAPV, SKOS_EXACT_MATCH

from oaklib import get_implementation_from_shorthand
from oaklib.datamodels.mapping_cluster_datamodel import (
MappingCluster,
MappingClusterReport,
)
from oaklib.datamodels.vocabulary import HAS_DBXREF, SKOS_CLOSE_MATCH
from oaklib.interfaces import MappingProviderInterface
from oaklib.io.streaming_csv_writer import StreamingCsvWriter
from oaklib.parsers.boomer_parser import BoomerParser
from oaklib.types import CURIE, PRED_CURIE
from oaklib.utilities.mapping.sssom_utils import StreamingSssomWriter

logger = logging.getLogger(__name__)
Expand All @@ -31,6 +33,7 @@ class DiffType(Enum):
AMBIGUOUS = "AMBIGUOUS"
NEW = "NEW"
CONFLICT = "CONFLICT"
REJECT = "REJECT"


# One row of a comparison: (diff type, boomer predicate or None, mapping, confidence).
# The second and fourth slots are Optional because compare() yields None for them
# in some rows.
MAPPING_DIFF = Tuple[DiffType, Optional[PRED_CURIE], sssom.Mapping, Optional[float]]
Expand All @@ -39,6 +42,7 @@ class DiffType(Enum):
def _predicate_ids(mappings: List[sssom.Mapping]) -> List[CURIE]:
    """
    Collect the distinct predicate IDs used by a list of mappings.

    :param mappings: mappings to inspect
    :return: each distinct ``predicate_id``, in order of first appearance
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is reproducible between runs (a set would give an arbitrary order).
    return list(dict.fromkeys(m.predicate_id for m in mappings))


@dataclass
class BoomerEngine:
"""
Expand Down Expand Up @@ -108,7 +112,9 @@ def load(self, path: Union[TextIO, Path, str]) -> MappingClusterReport:
self.report = MappingClusterReport(clusters=clusters)
return self.report

def index_mappings(self, mappings: List[sssom.Mapping], max_per_pair: Optional[int] = None) -> MAPPING_SP_INDEX:
def index_mappings(
self, mappings: List[sssom.Mapping], max_per_pair: Optional[int] = None
) -> MAPPING_SP_INDEX:
mix: MAPPING_SP_INDEX = defaultdict(list)
for m in mappings:
pair = (m.subject_id, m.object_id)
Expand All @@ -117,8 +123,27 @@ def index_mappings(self, mappings: List[sssom.Mapping], max_per_pair: Optional[i
raise ValueError(f"Too many for {pair} => {mix[pair]}")
return mix

def compare(self, current_mappings: List[sssom.Mapping], minimum_confidence: Optional[float] = 0.95) -> Iterator[MAPPING_DIFF]:
boomer_mapping_ix = self.index_mappings(list(self.mappings(minimum_confidence=minimum_confidence)), max_per_pair=1)
def compare(
self,
current_mappings: List[sssom.Mapping],
minimum_confidence: Optional[float] = 0.95,
reject_non_exact=False,
promote_xref_to_exact=False,
discard_new_close_matches=True,
) -> Iterator[MAPPING_DIFF]:
"""
Compares a set of pre-existing mappings with boomer resolved mappings.
:param current_mappings: source mappings to evaluate
:param minimum_confidence: any boomer resolved mapping that has confidence beneath this is ignored
:param reject_non_exact: if True, then any mapping that matches a confident resolved mapping is typed REJECT
:param promote_xref_to_exact: if True, then any xref in the source is treated as skos:exactMatch
:param discard_new_close_matches: if True, then do not suggest NEW lines for high confidence closeMatch
:return:
"""
boomer_mapping_ix = self.index_mappings(
list(self.mappings(minimum_confidence=minimum_confidence)), max_per_pair=1
)
current_mapping_ix = self.index_mappings(current_mappings)
for pair, mappings in boomer_mapping_ix.items():
[bm] = mappings
Expand All @@ -127,23 +152,36 @@ def compare(self, current_mappings: List[sssom.Mapping], minimum_confidence: Opt
current_mappings_for_pair = current_mapping_ix[pair]
if len(current_mappings_for_pair) > 1:
for m in current_mappings_for_pair:
yield DiffType.AMBIGUOUS, m.predicate_id, m, None
yield DiffType.AMBIGUOUS, None, m, None
for m in current_mappings_for_pair:
if promote_xref_to_exact and m.predicate_id == HAS_DBXREF:
m = copy(m)
m.predicate_id = SKOS_EXACT_MATCH
if m.predicate_id == boomer_pred_id:
yield DiffType.OK, m.predicate_id, m, bm.confidence
yield DiffType.OK, None, m, bm.confidence
else:
yield DiffType.CONFLICT, bm.predicate_id, m, bm.confidence
if reject_non_exact and bm.predicate_id != SKOS_EXACT_MATCH:
yield DiffType.REJECT, bm.predicate_id, m, bm.confidence
else:
yield DiffType.CONFLICT, bm.predicate_id, m, bm.confidence
else:
yield DiffType.NEW, bm.predicate_id, bm, bm.confidence
if reject_non_exact and bm.predicate_id != SKOS_EXACT_MATCH:
continue
if discard_new_close_matches and bm.predicate_id == SKOS_CLOSE_MATCH:
continue
yield DiffType.NEW, None, bm, bm.confidence


# Reusable click decorator for the ``--minimum-confidence`` / ``-L`` float
# option; defined once here so commands can apply it as ``@min_confidence_option``.
min_confidence_option = click.option(
    "--minimum-confidence",
    "-L",
    help="Do not show mappings with lower confidence",
    type=click.FLOAT,
    default=0.95,
    show_default=True,
)


@click.group()
@click.option("-v", "--verbose", count=True)
@click.option("-q", "--quiet")
Expand Down Expand Up @@ -178,7 +216,9 @@ def export(input_report, **kwargs):
"""
Exports mappings from a boomer report.
boomerang export tests/input/boomer-example.md
Example:
boomerang export tests/input/boomer-example.md
"""
ben = BoomerEngine()
ben.load(input_report)
Expand All @@ -188,15 +228,59 @@ def export(input_report, **kwargs):


@main.command()
@click.option("--input-ontology",
"-i",
help="use OAK selector syntax")
@click.option("--input-ontology", "-i", help="ontology from which to retrieve mappings")
@click.option(
"--reject-non-exact/--no-reject-non-exact",
help="if set then any match to a high confidence boomer interpretation that is a reject.",
)
@click.option(
"--promote-xref-to-exact/--no-promote-xref-to-exact",
help="if set then any xref in the source is promoted to an EXACT.",
)
@min_confidence_option
@click.argument("input_report")
def compare(input_report, input_ontology: str, **kwargs):
"""
Compares boomer results with existing mappings.
This assumes boomer has been executed in advance, and a markdown report generated.
Pass in as an argument the same ontology used in the boomer run.
Example:
boomerang compare foo-boomer.md -i foo.db
For any mapping marked NEW, this can be incorporated into the ontology.
For any mapping marked CONFLICT, there is some action that needs to be taken
By default any boomer resolved mapping beneath the default minimum confidence is ignored.
To customize, e.g. stringent:
Example:
boomerang compare foo-boomer.md -i foo.db -L 0.999
For each high confidence boomer mapping, this is compared against current mappings and
a suggestion made.
SPECIFIC SUGGESTIONS FOR OBO ONTOLOGIES:
In many ontologies it is conventional to (a) model all mappings as xrefs (b) assume
a default interpretation of exactMatch.
In these cases, we want to REJECT any existing xref IF there is a high confidence
boomer mapping FOR ANYTHING OTHER THAN exactMatch (including SiblingOf)
Example:
boomerang compare foo-boomer.md -i foo.db -L 0.999 --reject-non-exact --promote-xref-to-exact
The results here are straightforward, either REJECT, NEW, or OK
If this is NOT your workflow, then the results may include CONFLICT lines where
the interpretation you state is different from the interpretation in the boomer report.
See https://github.com/INCATools/boomer/issues/334
"""
writer = StreamingCsvWriter()
Expand All @@ -207,8 +291,17 @@ def compare(input_report, input_ontology: str, **kwargs):
ben = BoomerEngine()
ben.load(input_report)
for md in ben.compare(current_mappings, **kwargs):
t, pred, m, conf = md
writer.emit(dict(type=t.value, confidence=conf, predicate_id=pred, subject_id=m.subject_id, object_id=m.object_id))
t, info, m, conf = md
writer.emit(
dict(
type=t.value,
info=info,
confidence=conf,
predicate_id=m.predicate_id,
subject_id=m.subject_id,
object_id=m.object_id,
)
)


if __name__ == "__main__":
Expand Down
40 changes: 40 additions & 0 deletions tests/input/boomer-fake-go-example.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
## test good cluster
Identifier: test1
Method: exhaustive search
Score: -1.2417274456244485
Estimated probability: 0.8
Confidence: 0.95
Subsequent scores (max 10): -3.5294095180762297, -6.4093099761050984, -6.4093099761050984, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556879, -6.696992048556879, -6.984674121008659

- [nucleus](GO:0005634) EquivalentTo [other nucleus](FAKE:1) (most probable) 0.4
- [nucleus](GO:0005634) SiblingOf [wikipedia nucleus](Wikipedia:Cell_nucleus) 0.4

## another good cluster
Identifier: test2
Method: exhaustive search
Score: -1.2417274456244485
Estimated probability: 0.8
Confidence: 0.95
Subsequent scores (max 10): -3.5294095180762297, -6.4093099761050984, -6.4093099761050984, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556879, -6.696992048556879, -6.984674121008659

- [vacuole](GO:0005773) EquivalentTo [wikipedia vacuole](Wikipedia:Vacuole) 0.8

## test poor cluster
Identifier: test3
Method: exhaustive search
Score: -10.2417274456244485
Estimated probability: 0.1
Confidence: 0.1
Subsequent scores (max 10): -3.5294095180762297, -6.4093099761050984, -6.4093099761050984, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556879, -6.696992048556879, -6.984674121008659

- [nucleus](GO:0005635) EquivalentTo [other nuclear envelope](Wikipedia:Nuclear_envelope) (most probable) 0.4
- [cytoplasm](GO:0005737) EquivalentTo [other cytoplasm](Wikipedia:Cytoplasm) (most probable) 0.4

## SINGLETONS
Method: singletons
Score: 0.0
Estimated probability: 1.0
Confidence: 1.0
Subsequent scores (max 10):


64 changes: 62 additions & 2 deletions tests/test_utilities/test_boomer_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import unittest

from oaklib.utilities.mapping.boomer_utils import BoomerEngine
from tests import INPUT_DIR
from oaklib import get_implementation_from_shorthand
from oaklib.interfaces import MappingProviderInterface
from oaklib.utilities.mapping.boomer_utils import BoomerEngine, DiffType
from tests import EXAMPLE_ONTOLOGY_DB, INPUT_DIR, NUCLEUS, VACUOLE

EXAMPLE = INPUT_DIR / "boomer-example.md"
GO_EXAMPLE = INPUT_DIR / "boomer-fake-go-example.md"


class TestBoomerUtils(unittest.TestCase):
Expand All @@ -24,3 +27,60 @@ def test_filter(self):
for m in ms:
print(m)
self.assertEqual(expected_n, len(ms))

def test_compare(self):
    """
    Tests BoomerEngine.compare against a fake GO example.

    Each case is (minimum_confidence, reject_non_exact, promote_xref_to_exact, expected),
    where expected lists the (diff type, subject id, object id) tuples that
    compare() should yield, in any order.
    """
    ben = BoomerEngine()
    ben.load(GO_EXAMPLE)
    adapter = get_implementation_from_shorthand(str(EXAMPLE_ONTOLOGY_DB))
    if not isinstance(adapter, MappingProviderInterface):
        raise AssertionError(f"{EXAMPLE_ONTOLOGY_DB} can't supply mappings")
    current_mappings = list(adapter.all_sssom_mappings())
    self.assertGreater(len(current_mappings), 10)
    cases = [
        (0.99, False, False, []),
        (0.99, True, True, []),
        (
            0.75,
            False,
            False,
            [
                (DiffType.NEW, NUCLEUS, "FAKE:1"),
                (DiffType.CONFLICT, NUCLEUS, "Wikipedia:Cell_nucleus"),
                (DiffType.CONFLICT, VACUOLE, "Wikipedia:Vacuole"),
            ],
        ),
        (
            0.75,
            True,
            False,
            [
                (DiffType.NEW, NUCLEUS, "FAKE:1"),
                (DiffType.REJECT, NUCLEUS, "Wikipedia:Cell_nucleus"),
                (DiffType.CONFLICT, VACUOLE, "Wikipedia:Vacuole"),
            ],
        ),
        (
            0.75,
            True,
            True,
            [
                (DiffType.NEW, NUCLEUS, "FAKE:1"),
                (DiffType.REJECT, NUCLEUS, "Wikipedia:Cell_nucleus"),
                (DiffType.OK, VACUOLE, "Wikipedia:Vacuole"),
            ],
        ),
    ]
    for minimum_confidence, reject_non_exact, promote_xref_to_exact, expected in cases:
        # subTest keeps the remaining cases running after a failure and labels
        # the failure with the parameters that produced it.
        with self.subTest(
            minimum_confidence=minimum_confidence,
            reject_non_exact=reject_non_exact,
            promote_xref_to_exact=promote_xref_to_exact,
        ):
            results = list(
                ben.compare(
                    current_mappings,
                    minimum_confidence=minimum_confidence,
                    reject_non_exact=reject_non_exact,
                    promote_xref_to_exact=promote_xref_to_exact,
                )
            )
            # Each result is (diff type, info, mapping, confidence); only the
            # diff type and the mapped pair are compared.
            result_tups = [(r[0], r[2].subject_id, r[2].object_id) for r in results]
            self.assertCountEqual(expected, result_tups)

0 comments on commit b1e4a9b

Please sign in to comment.