Adding BoomerEngine.compare, addresses INCATools/boomer#334

INCATools · Jan 31, 2023 · b1e4a9b · b1e4a9b
1 parent e8699d7
commit b1e4a9b
Show file tree

Hide file tree

Showing 4 changed files with 228 additions and 27 deletions.
diff --git a/src/oaklib/parsers/boomer_parser.py b/src/oaklib/parsers/boomer_parser.py
@@ -16,19 +16,21 @@
 )
 from oaklib.parsers.parser_base import Parser
 
-re_new_block = re.compile(r"^## (.*)")
-re_tag_val = re.compile(r"^(\S+): (.*)")
-re_mapping1 = re.compile(r"^- (\[\S+)\s+(\w+)\s+(\S+\))\s+\(most probable\)\s+(\S+)$")
-re_mapping2 = re.compile(r"^- (\[\S+)\s+(\w+)\s+(\S+\))\s+(\S+)$")
-re_md_link = re.compile(r"^\[(.*)\]\((\S+)\)")
-
 BOOMER_TO_SKOS = {
     "SiblingOf": SKOS_CLOSE_MATCH,
     "EquivalentTo": SKOS_EXACT_MATCH,
     "ProperSubClassOf": SKOS_BROAD_MATCH,
     "ProperSuperClassOf": SKOS_NARROW_MATCH,
 }
 
+BOOMER_PRED_RE = "|".join(BOOMER_TO_SKOS.keys())
+
+re_new_block = re.compile(r"^## (.*)")
+re_tag_val = re.compile(r"^(\w.*):\s*(.*)")
+re_mapping1 = re.compile(rf"^- (\[.+)\s+({BOOMER_PRED_RE})\s+(.+\))\s+\(most probable\)\s+(\S+)$")
+re_mapping2 = re.compile(rf"^- (\[.+)\s+({BOOMER_PRED_RE})\s+(.+\))\s+(\S+)$")
+re_md_link = re.compile(r"^\[(.*)\]\((\S+)\)")
+
 
 @dataclass
 class BoomerParser(Parser):
@@ -62,6 +64,10 @@ def parse(self, file: TextIO) -> Iterator[MappingCluster]:
                     cluster.posterior_probability = float(val)
                 elif tag == "Confidence":
                     cluster.confidence = float(val)
+                elif tag.startswith("Subsequent"):
+                    pass
+                else:
+                    logging.warning(f"Unparsed tag {tag} in {line}")
                 continue
             m = re_mapping1.match(line)
             if not m:
@@ -85,6 +91,8 @@ def parse(self, file: TextIO) -> Iterator[MappingCluster]:
                 )
                 cluster.resolved_mappings.append(mapping)
                 continue
+            if line:
+                logging.warning(f"Cannot parse: {line}")
         if not cluster:
             logging.warning("No clusters in file")
             return

diff --git a/src/oaklib/utilities/mapping/boomer_utils.py b/src/oaklib/utilities/mapping/boomer_utils.py
@@ -1,23 +1,25 @@
 import logging
 from collections import defaultdict
+from copy import copy
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Iterator, List, Optional, TextIO, Union, Dict, Tuple
+from typing import Dict, Iterator, List, Optional, TextIO, Tuple, Union
 
 import click
 import sssom_schema as sssom
-from oaklib import get_implementation_from_shorthand
-from oaklib.interfaces import MappingProviderInterface
-from oaklib.io.streaming_csv_writer import StreamingCsvWriter
-from oaklib.types import CURIE, PRED_CURIE
-from sssom.constants import SEMAPV
+from sssom.constants import SEMAPV, SKOS_EXACT_MATCH
 
+from oaklib import get_implementation_from_shorthand
 from oaklib.datamodels.mapping_cluster_datamodel import (
     MappingCluster,
     MappingClusterReport,
 )
+from oaklib.datamodels.vocabulary import HAS_DBXREF, SKOS_CLOSE_MATCH
+from oaklib.interfaces import MappingProviderInterface
+from oaklib.io.streaming_csv_writer import StreamingCsvWriter
 from oaklib.parsers.boomer_parser import BoomerParser
+from oaklib.types import CURIE, PRED_CURIE
 from oaklib.utilities.mapping.sssom_utils import StreamingSssomWriter
 
 logger = logging.getLogger(__name__)
@@ -31,6 +33,7 @@ class DiffType(Enum):
     AMBIGUOUS = "AMBIGUOUS"
     NEW = "NEW"
     CONFLICT = "CONFLICT"
+    REJECT = "REJECT"
 
 
 MAPPING_DIFF = Tuple[DiffType, PRED_CURIE, sssom.Mapping, Optional[float]]
@@ -39,6 +42,7 @@ class DiffType(Enum):
 def _predicate_ids(mappings: List[sssom.Mapping]) -> List[CURIE]:
     return list(set([m.predicate_id for m in mappings]))
 
+
 @dataclass
 class BoomerEngine:
     """
@@ -108,7 +112,9 @@ def load(self, path: Union[TextIO, Path, str]) -> MappingClusterReport:
         self.report = MappingClusterReport(clusters=clusters)
         return self.report
 
-    def index_mappings(self, mappings: List[sssom.Mapping], max_per_pair: Optional[int] = None) -> MAPPING_SP_INDEX:
+    def index_mappings(
+        self, mappings: List[sssom.Mapping], max_per_pair: Optional[int] = None
+    ) -> MAPPING_SP_INDEX:
         mix: MAPPING_SP_INDEX = defaultdict(list)
         for m in mappings:
             pair = (m.subject_id, m.object_id)
@@ -117,8 +123,27 @@ def index_mappings(self, mappings: List[sssom.Mapping], max_per_pair: Optional[i
                 raise ValueError(f"Too many for {pair} => {mix[pair]}")
         return mix
 
-    def compare(self, current_mappings: List[sssom.Mapping], minimum_confidence: Optional[float] = 0.95) -> Iterator[MAPPING_DIFF]:
-        boomer_mapping_ix = self.index_mappings(list(self.mappings(minimum_confidence=minimum_confidence)), max_per_pair=1)
+    def compare(
+        self,
+        current_mappings: List[sssom.Mapping],
+        minimum_confidence: Optional[float] = 0.95,
+        reject_non_exact=False,
+        promote_xref_to_exact=False,
+        discard_new_close_matches=True,
+    ) -> Iterator[MAPPING_DIFF]:
+        """
+        Compares a set of pre-existing mappings with boomer resolved mappings.
+
+        :param current_mappings: source mappings to evaluate
+        :param minimum_confidence: any boomer resolved mapping that has confidence beneath this is ignored
+        :param reject_non_exact: if True, then any current mapping that disagrees with a confident resolved mapping that is not skos:exactMatch is typed REJECT rather than CONFLICT
+        :param promote_xref_to_exact: if True, then any xref in the source is treated as skos:exactMatch
+        :param discard_new_close_matches: if True, then do not suggest NEW lines for high confidence closeMatch
+        :return: iterator of (diff type, boomer predicate or None, mapping, boomer confidence or None) tuples
+        """
+        boomer_mapping_ix = self.index_mappings(
+            list(self.mappings(minimum_confidence=minimum_confidence)), max_per_pair=1
+        )
         current_mapping_ix = self.index_mappings(current_mappings)
         for pair, mappings in boomer_mapping_ix.items():
             [bm] = mappings
@@ -127,23 +152,36 @@ def compare(self, current_mappings: List[sssom.Mapping], minimum_confidence: Opt
                 current_mappings_for_pair = current_mapping_ix[pair]
                 if len(current_mappings_for_pair) > 1:
                     for m in current_mappings_for_pair:
-                        yield DiffType.AMBIGUOUS, m.predicate_id, m, None
+                        yield DiffType.AMBIGUOUS, None, m, None
                 for m in current_mappings_for_pair:
+                    if promote_xref_to_exact and m.predicate_id == HAS_DBXREF:
+                        m = copy(m)
+                        m.predicate_id = SKOS_EXACT_MATCH
                     if m.predicate_id == boomer_pred_id:
-                        yield DiffType.OK, m.predicate_id, m, bm.confidence
+                        yield DiffType.OK, None, m, bm.confidence
                     else:
-                        yield DiffType.CONFLICT, bm.predicate_id, m, bm.confidence
+                        if reject_non_exact and bm.predicate_id != SKOS_EXACT_MATCH:
+                            yield DiffType.REJECT, bm.predicate_id, m, bm.confidence
+                        else:
+                            yield DiffType.CONFLICT, bm.predicate_id, m, bm.confidence
             else:
-                yield DiffType.NEW, bm.predicate_id, bm, bm.confidence
+                if reject_non_exact and bm.predicate_id != SKOS_EXACT_MATCH:
+                    continue
+                if discard_new_close_matches and bm.predicate_id == SKOS_CLOSE_MATCH:
+                    continue
+                yield DiffType.NEW, None, bm, bm.confidence
 
 
 min_confidence_option = click.option(
     "--minimum-confidence",
     "-L",
     type=click.FLOAT,
+    default=0.95,
+    show_default=True,
     help="Do not show mappings with lower confidence",
 )
 
+
 @click.group()
 @click.option("-v", "--verbose", count=True)
 @click.option("-q", "--quiet")
@@ -178,7 +216,9 @@ def export(input_report, **kwargs):
     """
     Exports mappings from a boomer report.
 
-    boomerang export tests/input/boomer-example.md
+    Example:
+
+        boomerang export tests/input/boomer-example.md
     """
     ben = BoomerEngine()
     ben.load(input_report)
@@ -188,15 +228,59 @@ def export(input_report, **kwargs):
 
 
 @main.command()
-@click.option("--input-ontology",
-              "-i",
-              help="use OAK selector syntax")
+@click.option("--input-ontology", "-i", help="ontology from which to retrieve mappings")
+@click.option(
+    "--reject-non-exact/--no-reject-non-exact",
+    help="if set then any match to a high confidence boomer interpretation that is a reject.",
+)
+@click.option(
+    "--promote-xref-to-exact/--no-promote-xref-to-exact",
+    help="if set then any xref in the source is promoted to an EXACT.",
+)
 @min_confidence_option
 @click.argument("input_report")
 def compare(input_report, input_ontology: str, **kwargs):
     """
     Compares boomer results with existing mappings.
 
+    This assumes boomer has been executed in advance, and a markdown report generated.
+    Pass in as an argument the same ontology used in the boomer run.
+
+    Example:
+
+        boomerang compare foo-boomer.md -i foo.db
+
+    For any mapping marked NEW, this can be incorporated into the ontology.
+
+    For any mapping marked CONFLICT, there is some action that needs to be taken.
+
+    By default any boomer resolved mapping beneath the default minimum confidence is ignored.
+    To customize, e.g. stringent:
+
+    Example:
+
+        boomerang compare foo-boomer.md -i foo.db -L 0.999
+
+    For each high confidence boomer mapping, this is compared against current mappings and
+    a suggestion made.
+
+    SPECIFIC SUGGESTIONS FOR OBO ONTOLOGIES:
+
+    In many ontologies it is conventional to (a) model all mappings as xrefs (b) assume
+    a default interpretation of exactMatch.
+
+    In these cases, we want to REJECT any existing xref IF there is a high confidence
+    boomer mapping FOR ANYTHING OTHER THAN exactMatch (including SiblingOf)
+
+    Example:
+
+        boomerang compare foo-boomer.md -i foo.db -L 0.999 --reject-non-exact --promote-xref-to-exact
+
+    The results here are straightforward, either REJECT, NEW, or OK
+
+    If this is NOT your workflow, then the results may include CONFLICT lines where
+    the interpretation you state is different from the interpretation in the boomer report.
+
     See https://github.com/INCATools/boomer/issues/334
     """
     writer = StreamingCsvWriter()
@@ -207,8 +291,17 @@ def compare(input_report, input_ontology: str, **kwargs):
     ben = BoomerEngine()
     ben.load(input_report)
     for md in ben.compare(current_mappings, **kwargs):
-        t, pred, m, conf = md
-        writer.emit(dict(type=t.value, confidence=conf, predicate_id=pred, subject_id=m.subject_id, object_id=m.object_id))
+        t, info, m, conf = md
+        writer.emit(
+            dict(
+                type=t.value,
+                info=info,
+                confidence=conf,
+                predicate_id=m.predicate_id,
+                subject_id=m.subject_id,
+                object_id=m.object_id,
+            )
+        )
 
 
 if __name__ == "__main__":

diff --git a/tests/input/boomer-fake-go-example.md b/tests/input/boomer-fake-go-example.md
@@ -0,0 +1,40 @@
+## test good cluster
+Identifier: test1
+Method: exhaustive search
+Score: -1.2417274456244485
+Estimated probability: 0.8
+Confidence: 0.95
+Subsequent scores (max 10): -3.5294095180762297, -6.4093099761050984, -6.4093099761050984, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556879, -6.696992048556879, -6.984674121008659
+
+- [nucleus](GO:0005634) EquivalentTo [other nucleus](FAKE:1)	(most probable)	0.4
+- [nucleus](GO:0005634) SiblingOf [wikipedia nucleus](Wikipedia:Cell_nucleus)	0.4
+
+## another good cluster
+Identifier: test2
+Method: exhaustive search
+Score: -1.2417274456244485
+Estimated probability: 0.8
+Confidence: 0.95
+Subsequent scores (max 10): -3.5294095180762297, -6.4093099761050984, -6.4093099761050984, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556879, -6.696992048556879, -6.984674121008659
+
+- [vacuole](GO:0005773) EquivalentTo [wikipedia vacuole](Wikipedia:Vacuole)	0.8
+
+## test poor cluster
+Identifier: test3
+Method: exhaustive search
+Score: -10.2417274456244485
+Estimated probability: 0.1
+Confidence: 0.1
+Subsequent scores (max 10): -3.5294095180762297, -6.4093099761050984, -6.4093099761050984, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556878, -6.696992048556879, -6.696992048556879, -6.984674121008659
+
+- [nucleus](GO:0005635) EquivalentTo [other nuclear envelope](Wikipedia:Nuclear_envelope)	(most probable)	0.4
+- [cytoplasm](GO:0005737) EquivalentTo [other cytoplasm](Wikipedia:Cytoplasm)	(most probable)	0.4
+
+## SINGLETONS
+Method: singletons
+Score: 0.0
+Estimated probability: 1.0
+Confidence: 1.0
+Subsequent scores (max 10): 
+
+
diff --git a/tests/test_utilities/test_boomer_utils.py b/tests/test_utilities/test_boomer_utils.py
@@ -1,9 +1,12 @@
 import unittest
 
-from oaklib.utilities.mapping.boomer_utils import BoomerEngine
-from tests import INPUT_DIR
+from oaklib import get_implementation_from_shorthand
+from oaklib.interfaces import MappingProviderInterface
+from oaklib.utilities.mapping.boomer_utils import BoomerEngine, DiffType
+from tests import EXAMPLE_ONTOLOGY_DB, INPUT_DIR, NUCLEUS, VACUOLE
 
 EXAMPLE = INPUT_DIR / "boomer-example.md"
+GO_EXAMPLE = INPUT_DIR / "boomer-fake-go-example.md"
 
 
 class TestBoomerUtils(unittest.TestCase):
@@ -24,3 +27,60 @@ def test_filter(self):
             for m in ms:
                 print(m)
             self.assertEqual(expected_n, len(ms))
+
+    def test_compare(self):
+        """
+        tests BoomerEngine.compare against a fake GO example
+        """
+        ben = BoomerEngine()
+        ben.load(GO_EXAMPLE)
+        adapter = get_implementation_from_shorthand(str(EXAMPLE_ONTOLOGY_DB))
+        if not isinstance(adapter, MappingProviderInterface):
+            raise AssertionError(f"{EXAMPLE_ONTOLOGY_DB} can't supply mappings")
+        current_mappings = list(adapter.all_sssom_mappings())
+        self.assertGreater(len(current_mappings), 10)
+        cases = [
+            (0.99, False, False, []),
+            (0.99, True, True, []),
+            (
+                0.75,
+                False,
+                False,
+                [
+                    (DiffType.NEW, NUCLEUS, "FAKE:1"),
+                    (DiffType.CONFLICT, NUCLEUS, "Wikipedia:Cell_nucleus"),
+                    (DiffType.CONFLICT, VACUOLE, "Wikipedia:Vacuole"),
+                ],
+            ),
+            (
+                0.75,
+                True,
+                False,
+                [
+                    (DiffType.NEW, NUCLEUS, "FAKE:1"),
+                    (DiffType.REJECT, NUCLEUS, "Wikipedia:Cell_nucleus"),
+                    (DiffType.CONFLICT, VACUOLE, "Wikipedia:Vacuole"),
+                ],
+            ),
+            (
+                0.75,
+                True,
+                True,
+                [
+                    (DiffType.NEW, NUCLEUS, "FAKE:1"),
+                    (DiffType.REJECT, NUCLEUS, "Wikipedia:Cell_nucleus"),
+                    (DiffType.OK, VACUOLE, "Wikipedia:Vacuole"),
+                ],
+            ),
+        ]
+        for minimum_confidence, reject_non_exact, promote_xref_to_exact, expected in cases:
+            results = list(
+                ben.compare(
+                    current_mappings,
+                    minimum_confidence=minimum_confidence,
+                    reject_non_exact=reject_non_exact,
+                    promote_xref_to_exact=promote_xref_to_exact,
+                )
+            )
+            result_tups = [(r[0], r[2].subject_id, r[2].object_id) for r in results]
+            self.assertCountEqual(expected, result_tups)