diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index d6a716a86..04de8b4a6 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -3,7 +3,7 @@ ## If you need to customize your Makefile, make ## changes here rather than in the main Makefile .PHONY: deploy-mondo-ingest build-mondo-ingest documentation mappings update-jinja-sparql-queries \ -report-mapping-annotations python-install-dependencies +report-mapping-annotations python-install-dependencies slurp-all slurp-% #################################### ### Standard constants ############# @@ -157,7 +157,7 @@ metadata/mondo.sssom.config.yml: mappings: sssom $(ALL_MAPPINGS) ################# -# Utils ######### +##### Utils ##### ################# # Documentation for `report-mapping-annotations` and `update-jinja-sparql-queries`: `docs/developer/ordo.md` # TODO: When https://github.com/monarch-initiative/mondo-ingest/issues/43 is fixed, can change back to `requirements.txt` @@ -172,6 +172,9 @@ update-jinja-sparql-queries: python-install-dependencies python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_replace_annotation_based_mappings.py python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_mapping_annotations_violation.py +################# +## Exclusions ### +################# config/%_term_exclusions.txt: config/%_exclusions.tsv component-download-%.owl $(REPORTDIR)/mirror_signature-%.tsv $(REPORTDIR)/component_signature-%.tsv metadata/%.yml python-install-dependencies python3 $(SCRIPTSDIR)/exclusion_term_expansion.py \ --onto-name $* \ @@ -237,7 +240,7 @@ tmp/mondo.sssom.tsv: wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@ tmp/mondo.sssom.ttl: tmp/mondo.sssom.tsv - sssom convert $< -O rdf -o $@ + sssom convert $< -O rdf -o $@ # Merge Mondo, precise mappings and mondo-ingest into one coherent whole for the purpose of querying. 
@@ -250,6 +253,7 @@ $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv: ../sparql/mondo-ordo-unsupport .PHONY: mondo-ordo-subclass mondo-ordo-subclass: $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv +# TODO: I believe we'll need a special case/goal for `reports/mirror_signature-mondo.tsv`, which will depend on `tmp/mondo.owl` instead. reports/mirror_signature-%.tsv: component-download-%.owl $(ROBOT) query -i $(TMPDIR)/$<.owl --query ../sparql/classes.sparql $@ @@ -278,19 +282,30 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv ############################# ###### Slurp pipeline ####### ############################# +# Mondo-specific download goal: when MIR and COMP are both `true`, merges Mondo from its PURL, annotates it, and writes `$(TMPDIR)/component-download-mondo.owl.owl`. TODO: (a) Move this to Makefile, or (b) refactor this away. +.PHONY: component-download-mondo.owl +component-download-mondo.owl: | $(TMPDIR) + if [ $(MIR) = true ] && [ $(COMP) = true ]; then $(ROBOT) merge -I http://purl.obolibrary.org/obo/mondo.owl \ + annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi + +# TODO: (a) Move this to Makefile, or (b) refactor this away, or (c) DELETE this goal. +# ...I think (c) is most likely, as `reports/mirror_signature-%.tsv` needs `component-download-%.owl`. +# ...`reports/component_signature-%.tsv` is what needs `components/%.owl`, but that shouldn't be needed here. +# $(COMPONENTSDIR)/mondo.owl: component-download-mondo.owl +# if [ $(COMP) = true ] ; then if cmp -s $(TMPDIR)/component-download-mondo.owl.owl $@ ; then echo "Component identical."; else echo "Component is different, updating." && cp $(TMPDIR)/component-download-mondo.owl.owl $@; fi; fi + slurp/: mkdir -p $@ -# Feel free to change the signature. Min ID is the next available Mondo ID. 
-slurp/%.tsv: components/%.owl tmp/mondo.sssom.tsv reports/mirror-signature-mondo.tsv | slurp/ +# min-id: the next available Mondo ID +slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/mirror_signature-mondo.tsv | slurp/ - python $(SCRIPTSDIR)/migrate.py \ + python3 $(SCRIPTSDIR)/migrate.py \ - -i $< \ - --mapping-file tmp/mondo.sssom.tsv \ + --ontology-path $(COMPONENTSDIR)/$*.owl \ + --sssom-map-path $(TMPDIR)/mondo.sssom.tsv \ --min-id 123000 \ - --mondo-terms reports/mirror-signature-mondo.tsv \ - --output $@ + --mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \ + --outpath $@ -slurp-%: slurp/%.tsv +slurp-%: slurp/%.tsv ; -# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo -slurp: slurp-omim +slurp-all: slurp/omim.tsv slurp/doid.tsv slurp/ncit.tsv slurp/ordo.tsv slurp/icd10cm.tsv slurp/icd10who.tsv diff --git a/src/scripts/migrate.py b/src/scripts/migrate.py index 97368979f..173b288f1 100644 --- a/src/scripts/migrate.py +++ b/src/scripts/migrate.py @@ -1,54 +1,138 @@ -"""Migration pipeline +"""Slurp migration pipeline -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING - -TODOs: - - add CLI: look to makefile for what to include +TODO's: + - """ -import oakliblib -import pandas +import os.path +from argparse import ArgumentParser +from typing import Dict, List + +import pandas as pd +import oaklib +from oaklib.implementations import ProntoImplementation +from oaklib.resource import OntologyResource + + +# TODO: implement this func: +# todo: IDs should be int or str? 
prolly str +def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str: + """Starting from `min_id`, count up and check until finding the next ID. NOTE: still a stub -- both arguments are ignored and '0' is always returned.""" + next_id = str(0) + return next_id + +def _get_direct_parents(curie: str, ontology: ProntoImplementation): + """Get CURIEs of parents of a class. As written, this collects the targets of every entry returned by `ontology.relationships([curie])`, whatever the predicate. + todo: `ontology` typing: should probably be typed agnostic of the implementation. But it inherits from so + much stuff, I'm not sure which if any of these to use. I was expecting `BasicOntologyInterface`, but it doesn't + inherit from that: RelationGraphInterface, OboGraphInterface, ValidatorInterface, SearchInterface, + SubsetterInterface, MappingProviderInterface, PatcherInterface, SemanticSimilarityInterface, MetadataInterface, + DifferInterface, ABC + """ + # TODO: How to get just the *direct* parents? (a) func for that? (b) look at relationships and see if subClass? + # rels = ontology.get_outgoing_relationships_by_curie(curie) # works for ProntoImplementation + rels = ontology.relationships([curie]) + p = [] + for rel, parents in rels.items(): + # print(f' {rel} ! {ontology.get_label_by_curie(rel)}') + for parent in parents: + # print(f' {parent} ! {ontology.get_label_by_curie(parent)}') + p.append(parent) + return p -#Inputs: -source_ontology = '' #e.g. omim -sssom_map = '' # e.g. mondo.sssom.tsv -min_id = '' -termlist_mondo = '' +def _get_all_term_curies(ontology: ProntoImplementation, is_omim=False) -> List[str]: + """Get all terms as CURIEs + todo: Will we need prefix_map from our config `.yml`s? Or can OAK handle automatically? 
+ - reasons we might need: (i) filter correct terms, (ii) convert URI to CURIE if get URI + """ + # todo: if oak can't handle automatically, I think we'll need `prefix_map`: + terms = [x for x in ontology.entities()] + # todo: temp until I figure out if (a) we need `prefix_map`, or (b) OAK can somehow handle automatically: + if is_omim: + terms = [x for x in terms if any([x.startswith(y) for y in [ + 'OMIM:', 'OMIMPS:', 'https://omim.org/entry/', 'https://omim.org/phenotypicSeries/PS']])] + # prefix_map = { + # 'OMIM': 'https://omim.org/entry/', + # 'OMIMPS': 'https://omim.org/phenotypicSeries/PS', + # } + # todo: convert to CURIE using charlie's new lib + return terms -def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''): - """source_ontology = '' #e.g. omim - sssom_map = '' # e.g. mondo.sssom.tsv - min_id = '' - termlist_mondo = ''""" - #Outputs: - data = [] - for t in source_ontology: - if t not in sssom_map['object_id']: - parents = [] +def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame: + """Run slurp pipeline for given ontology: read the ontology, the SSSOM mappings and the Mondo term list, collect slurpable terms, write them to `outpath` as TSV and return them as a DataFrame.""" + # Read source files + # todo: `ontology`: If trouble, can try `SparqlImplementation`, but ~6 min to load and queries slow (because of rdflib) + ontology = ProntoImplementation(OntologyResource(slug=ontology_path, local=True)) + sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t') + mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t') + + # Initialize variables + # todo: is_omim is temporary + source_onto_terms: List[str] = _get_all_term_curies(ontology, is_omim='omim' in os.path.basename(ontology_path)) + sssom_object_ids = set(sssom_df['object_id']) + + # Get slurpable terms + slurpable_terms = [] + for t in source_onto_terms: + if t not in sssom_object_ids: migrate = True - for p in oaklib.get_direct_parents(t): - if p not in sssom_map['object_id']: + # todo: Find the correct way of doing this: + # todo: probably best
to add all t[] at once, given the OAK api + direct_parents: List[str] = _get_direct_parents(t, ontology) + slurpable_parents: List[str] = [] + for parent in direct_parents: + if parent not in sssom_object_ids: migrate = False break - elif sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:exactMatch' \ - or sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:narrowMatch': - # In other words, if the parent is mapped, and the mapping is either exact or narrower - parents.append(sssom_map[sssom_map['object_id']==p]['subject_id']) else: - # Its fine, just continue looking for other parents in this case - if migrate and parents: - next_mondo_id = determine_next_available_mondo_id(min_id, termlist_mondo) # satrting from min_id, then counting up and checking if it does not already exist. + obj_data = sssom_df[sssom_df['object_id'] == parent] + matched = obj_data[obj_data['predicate_id'].isin(['skos:exactMatch', 'skos:narrowMatch'])] # rows mapping this parent exactly/narrowly; comparing str(Series) to a literal never matches + if not matched.empty: + # In other words, if the parent is mapped, and the mapping is either exact or narrower, OK to add + # todo: Ok to add + slurpable_parents.extend(matched['subject_id']) + else: + pass # It's fine, just continue looking for other parents in this case + if migrate and slurpable_parents: + # TODO: implement this func: + next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df) # starting from min_id, then counting up and checking if it does not already exist. 
+ # todo: Find the correct way of doing this (NOTE(review): `oaklib.get_label` / `oaklib.get_definition` look like leftovers from the earlier pseudo-code -- confirm they exist): label = oaklib.get_label(t) + # todo: Find the correct way of doing this: definition = oaklib.get_definition(t) - data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) + slurpable_terms.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) + + result = pd.DataFrame(slurpable_terms) + result.to_csv(outpath, sep="\t") + return result + - pandas.DataFrame(data).to_csv(fn, sep="\t") +def cli(): + """Command line interface: parse the five required options and forward them to `run()` as keyword arguments.""" + package_description = \ + 'Slurp pipeline: Integrate new terms from other ontologies into Mondo.' + parser = ArgumentParser(description=package_description) + parser.add_argument( + '-o', '--ontology-path', required=True, + help='Path to ontology file, e.g. an `.owl` file.') + parser.add_argument( + '-m', '--sssom-map-path', required=True, + help='Path to file containing all known Mondo mappings, in SSSOM format.') + parser.add_argument( + '-i', '--min-id', required=True, + help='The ID from which we want to begin searching from in order to locate any currently unslurped terms.') + parser.add_argument( + '-t', '--mondo-terms-path', required=True, + help='Path to a file that contains a list of all Mondo terms.') + parser.add_argument( + '-O', '--outpath', required=True, + help='Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.') + d: Dict = vars(parser.parse_args()) + # todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok? + run(**d) if __name__ == '__main__': - run() + cli()