Skip to content

Commit

Permalink
Bugfixes: match-mondo-sources-all-lexical.py
Browse files Browse the repository at this point in the history
- Bugfix: AttributeError: 'tuple' object has no attribute 'pop': wrong datatype for metadata was being passed to lexical_index_to_sssom()
- Bugfix: Fixed several other bugs in mondo-ingest, and upgraded OAK/sssom-py/curies to fix further bugs related to prefix maps.
- Update: mondo.sssom.config.yml: Commented out duplicate prefix 'oio'
- Update: prefixes.csv: Removed duplicate prefix oio
- Update: Python requirements: Upgraded curies for bugfix involving get_prefixes(include_synonyms)
  • Loading branch information
joeflack4 committed Jan 19, 2024
1 parent 76771d7 commit 63adbb6
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 18 deletions.
2 changes: 1 addition & 1 deletion python-requirements-apple-silicon.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ charset-normalizer==3.3.2
class-resolver==0.4.2
click==8.1.7
colorama==0.4.6
curies==0.7.4
curies==0.7.6
Deprecated==1.2.14
deprecation==2.1.0
distlib==0.3.7
Expand Down
2 changes: 1 addition & 1 deletion python-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class-resolver==0.4.2
click==8.1.7
colorama==0.4.6
commonmark==0.9.1
curies==0.6.4
curies==0.7.6
decorator==5.1.1
Deprecated==1.2.13
deprecation==2.1.0
Expand Down
2 changes: 1 addition & 1 deletion src/ontology/config/prefixes.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ rdf,http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs,http://www.w3.org/2000/01/rdf-schema#
xsd,http://www.w3.org/2001/XMLSchema#
owl,http://www.w3.org/2002/07/owl#
oio,http://www.geneontology.org/formats/oboInOwl#
oboInOwl,http://www.geneontology.org/formats/oboInOwl#
dce,http://purl.org/dc/elements/1.1/
dct,http://purl.org/dc/terms/
foaf,http://xmlns.com/foaf/0.1/
Expand Down
4 changes: 2 additions & 2 deletions src/ontology/metadata/mondo.sssom.config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ curie_map:
semapv: https://w3id.org/semapv/vocab/
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
sssom: https://w3id.org/sssom/
oio: http://www.geneontology.org/formats/oboInOwl#
# oio: http://www.geneontology.org/formats/oboInOwl#
GTR: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/"
NCI: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NCI/"
NIFSTD: "http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/NIFSTD/"
Expand Down Expand Up @@ -114,7 +114,7 @@ extended_prefix_map:
prefix_synonyms: []
uri_prefix: http://identifiers.org/snomedct/
uri_prefix_synonyms:
= http://snomed.info/id/
- http://snomed.info/id/
- prefix: OMIM
prefix_synonyms: []
uri_prefix: https://omim.org/entry/
Expand Down
80 changes: 67 additions & 13 deletions src/scripts/match-mondo-sources-all-lexical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@
# Use oak.mapping() pipeline

import logging
from collections import ChainMap
from datetime import datetime
from pathlib import Path
from typing import List, Tuple, Union

from curies import Converter
from oaklib.resource import OntologyResource
from oaklib.implementations.sqldb.sql_implementation import SqlImplementation
from oaklib.utilities.lexical.lexical_indexer import (
Expand All @@ -25,11 +30,12 @@
import yaml
import pandas as pd

from sssom.constants import SUBJECT_ID, OBJECT_ID, PREDICATE_MODIFIER
from sssom.constants import MetadataType, SUBJECT_ID, OBJECT_ID, PREDICATE_MODIFIER, get_default_metadata
from sssom.context import get_converter
from sssom.util import filter_prefixes, is_curie, is_iri
from sssom.parsers import parse_sssom_table
from sssom.writers import write_table
from sssom.io import get_metadata_and_prefix_map, filter_file
from sssom.io import _merge_converter, get_metadata_and_prefix_map, filter_file
from bioregistry import curie_from_iri

SRC = Path(__file__).resolve().parents[1]
Expand All @@ -49,6 +55,26 @@
)


# todo: Harshad originally set this up to also return metadata, but the metadata does not appear to be used anywhere.
#  Should we just instantiate a simple converter instead, and perhaps not wrap this in a function at all?
def get_converter_and_metadata(metadata_path: Union[None, str, Path] = None) -> Tuple[Converter, MetadataType]:
    """
    Load SSSOM metadata from a YAML file and build a prefix converter from it.

    The top-level keys of the file are layered over the SSSOM default metadata, with values from the file taking
    precedence. The ``extended_prefix_map`` entry, if present, is removed from the metadata and used to build the
    converter, which is then passed through ``_merge_converter`` (presumably to add the default prefixes -- confirm
    against sssom-py).

    :param metadata_path: The metadata file in YAML format. If None, the default converter and the default metadata
      are returned unchanged.
    :return: A tuple of (converter, metadata), where metadata no longer contains ``extended_prefix_map``.
    """
    if metadata_path is None:
        return get_converter(), get_default_metadata()

    with Path(metadata_path).resolve().open() as file:
        # An empty YAML document loads as None; treat that as "no overrides" so that building the ChainMap below
        # does not fail with an opaque TypeError.
        metadata = yaml.safe_load(file) or {}

    # In a ChainMap the earlier mapping wins, so values from the file override the defaults.
    metadata = dict(ChainMap(metadata, get_default_metadata()))
    # The extended prefix map is a list of records; an absent key means "no records".
    converter = Converter.from_extended_prefix_map(metadata.pop('extended_prefix_map', []))
    converter = _merge_converter(converter)
    return converter, metadata


@click.group()
@click.option("-v", "--verbose", count=True)
@click.option("-q", "--quiet")
Expand Down Expand Up @@ -83,18 +109,30 @@ def main(verbose: int, quiet: bool):
)
@output_option
def run(input: str, config: str, rules: str, rejects: str, output: str):
# Implemented `meta` param in `lexical_index_to_sssom`

meta = get_metadata_and_prefix_map(config)
t0 = datetime.now() # todo: temp
# TODO 01/17: this will be a private method. can i simply create converter (and meta if needed) here?
# - a. if old code passes now, possibly keep it or a local copy that does normal curie_map
# - b. if old code passes, keep using EPM even though this could cause need for include_synonyms
# old code
# converter, meta = get_metadata_and_prefix_map(config)
# new code
converter, meta = get_converter_and_metadata(config)

# todo's <01/17
# todo temp: if lexical_index_to_sssom() is actually in need of passing 'meta', pass msdf_meta? But causes error:
# ValueError: Unknown argument: curie_map = ...
# msdf_meta = {'curie_map': converter.prefix_map}
with open(config, "r") as f:
yml = yaml.safe_load(f)

# Get mondo.sssom.tsv
# TODO: <uncomment
mapping_msdf = parse_sssom_table(SSSOM_MAP_FILE)
reject_df = pd.read_csv(
rejects, sep="\t", index_col=None
)
mapping_msdf.df = pd.concat([mapping_msdf.df, reject_df])[mapping_msdf.df.columns].drop_duplicates()
# TODO: /uncomment>
# mapping_msdf.df = (
# pd.merge(
# mapping_msdf.df,
Expand All @@ -108,19 +146,31 @@ def run(input: str, config: str, rules: str, rejects: str, output: str):
# .reset_index(drop=True)
# )

prefix_of_interest = yml["subject_prefixes"]

resource = OntologyResource(slug=f"sqlite:///{Path(input).absolute()}")
oi = SqlImplementation(resource=resource)
ruleset = load_mapping_rules(rules)
# syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer]
lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset)
save_lexical_index(lexical_index, OUT_INDEX_DB)

# TODO: uncomment
# lexical_index = create_lexical_index(oi=oi, mapping_rule_collection=ruleset)
# save_lexical_index(lexical_index, OUT_INDEX_DB)
# TODO: /uncomment

# TODO temp delete after
import pickle
pp = '/Users/joeflack4/projects/mondo-ingest/cache/issues/lexmatch/mondo-ingest/lexical_index.pickle/b4_after_remove_oio_prefixes_csv/after/lexical_index.pickle'
# pickle.dump(lexical_index, open(pp, "wb"))
lexical_index = pickle.load(open(pp, "rb"))

t0_2 = datetime.now() # todo: temp
if rules:
msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, meta=meta)
# todo: backup: remove when done
# epm: List = converter.records
# msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, extended_prefix_map=epm)
msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=ruleset, prefix_map=converter)
else:
msdf = lexical_index_to_sssom(oi, lexical_index, meta=meta)
msdf = lexical_index_to_sssom(oi, lexical_index)
t1_2 = datetime.now() # todo: temp
print('lexical_index_to_sssom complete in seconds:', (t1_2 - t0_2).seconds) # todo temp

# msdf.prefix_map = sssom_yaml['curie_map']
# msdf.metadata = sssom_yaml['global_metadata']
Expand All @@ -131,8 +181,9 @@ def run(input: str, config: str, rules: str, rejects: str, output: str):
# msdf.df[OBJECT_ID] = msdf.df[OBJECT_ID].apply(
# lambda x: iri_to_curie(x) if x.startswith("<http") else x
# )
prefixes_of_interest = yml["subject_prefixes"]
msdf.df = filter_prefixes(
df=msdf.df, filter_prefixes=prefix_of_interest, features=[SUBJECT_ID, OBJECT_ID]
df=msdf.df, filter_prefixes=prefixes_of_interest, features=[SUBJECT_ID, OBJECT_ID]
)
msdf.remove_mappings(mapping_msdf)

Expand All @@ -145,6 +196,9 @@ def run(input: str, config: str, rules: str, rejects: str, output: str):
kwargs = {"subject_id": ("MONDO:%",), "object_id": prefix_args}
with open(str(Path(output.replace("lexical", "lexical-2"))), "w") as f:
filter_file(input=str(Path(output)), output=f, **kwargs)
t1 = datetime.now() # todo: temp
print('match-mondo-sources-all-lexical complete in seconds:', (t1 - t0).seconds) # todo temp
print()


def iri_to_curie(item):
Expand Down

0 comments on commit 63adbb6

Please sign in to comment.