From 9c9f296f0620abe456e7646333458e863613cc48 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Wed, 3 Aug 2022 16:10:53 -0400 Subject: [PATCH] Slurp pipeline - Add: makefile: Missing make goals for dependencies for slurp goal. - Add: Python: CLI - Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params. - Update: Python: Completed script, inspired by initial pseudo code. (WIP) --- src/ontology/mondo-ingest.Makefile | 37 +++++++--- src/ontology/slurp/omim.tsv | 1 + src/scripts/migrate.py | 110 ++++++++++++++++++++--------- 3 files changed, 102 insertions(+), 46 deletions(-) create mode 100644 src/ontology/slurp/omim.tsv diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index d6a716a86..04de8b4a6 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -3,7 +3,7 @@ ## If you need to customize your Makefile, make ## changes here rather than in the main Makefile .PHONY: deploy-mondo-ingest build-mondo-ingest documentation mappings update-jinja-sparql-queries \ -report-mapping-annotations python-install-dependencies +report-mapping-annotations python-install-dependencies slurp-all slurp-% #################################### ### Standard constants ############# @@ -157,7 +157,7 @@ metadata/mondo.sssom.config.yml: mappings: sssom $(ALL_MAPPINGS) ################# -# Utils ######### +##### Utils ##### ################# # Documentation for `report-mapping-annotations` and `update-jinja-sparql-queries`: `docs/developer/ordo.md` # TODO: When https://github.com/monarch-initiative/mondo-ingest/issues/43 is fixed, can change back to `requirements.txt` @@ -172,6 +172,9 @@ update-jinja-sparql-queries: python-install-dependencies python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_replace_annotation_based_mappings.py python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_mapping_annotations_violation.py +################# +## Exclusions 
### +################# config/%_term_exclusions.txt: config/%_exclusions.tsv component-download-%.owl $(REPORTDIR)/mirror_signature-%.tsv $(REPORTDIR)/component_signature-%.tsv metadata/%.yml python-install-dependencies python3 $(SCRIPTSDIR)/exclusion_term_expansion.py \ --onto-name $* \ @@ -237,7 +240,7 @@ tmp/mondo.sssom.tsv: wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@ tmp/mondo.sssom.ttl: tmp/mondo.sssom.tsv - sssom convert $< -O rdf -o $@ + sssom convert $< -O rdf -o $@tmp/mondo.ow # Merge Mondo, precise mappings and mondo-ingest into one coherent whole for the purpose of querying. @@ -250,6 +253,7 @@ $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv: ../sparql/mondo-ordo-unsupport .PHONY: mondo-ordo-subclass mondo-ordo-subclass: $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv +# TODO: I believe we'll need a special case/goal for `reports/mirror_signature-mondo.tsv`, which will depend on `tmp/mondo.owl` instead. reports/mirror_signature-%.tsv: component-download-%.owl $(ROBOT) query -i $(TMPDIR)/$<.owl --query ../sparql/classes.sparql $@ @@ -278,19 +282,30 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv ############################# ###### Slurp pipeline ####### ############################# +# TODO: (a) Move this to Makefile, or (b) refactor this away. +.PHONY: component-download-mondo.owl +component-download-mondo.owl: | $(TMPDIR) + if [ $(MIR) = true ] && [ $(COMP) = true ]; then $(ROBOT) merge -I http://purl.obolibrary.org/obo/mondo.owl \ + annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi + +# TODO: (a) Move this to Makefile, or (b) refactor this away, or (c) DELETE this goal. +# ...I think (c) is most likely, as `reports/mirror_signature-%.tsv` needs `component-download-%.owl`. +# ...`reports/component_signature-%.tsv` is what needs `components/%.owl`, but that shouldn't be needed here. 
+# $(COMPONENTSDIR)/mondo.owl: component-download-mondo.owl +# if [ $(COMP) = true ] ; then if cmp -s $(TMPDIR)/component-download-mondo.owl.owl $@ ; then echo "Component identical."; else echo "Component is different, updating." && cp $(TMPDIR)/component-download-mondo.owl.owl $@; fi; fi + slurp/: mkdir -p $@ -# Feel free to change the signature. Min ID is the next available Mondo ID. -slurp/%.tsv: components/%.owl tmp/mondo.sssom.tsv reports/mirror-signature-mondo.tsv | slurp/ +# min-id: the next available Mondo ID +slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/mirror_signature-mondo.tsv | slurp/ python $(SCRIPTSDIR)/migrate.py \ - -i $< \ - --mapping-file tmp/mondo.sssom.tsv \ + --ontology-path $(COMPONENTSDIR)/$*.owl \ + --sssom-map-path $(TMPDIR)/mondo.sssom.tsv \ --min-id 123000 \ - --mondo-terms reports/mirror-signature-mondo.tsv \ - --output $@ + --mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \ + --outpath $@ slurp-%: slurp/%.tsv -# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo -slurp: slurp-omim +slurp-all: slurp-omim slurp-doid slurp-ncit slurp-ordo slurp-icd10cm slurp-icd10who diff --git a/src/ontology/slurp/omim.tsv b/src/ontology/slurp/omim.tsv new file mode 100644 index 000000000..e16c76dff --- /dev/null +++ b/src/ontology/slurp/omim.tsv @@ -0,0 +1 @@ +"" diff --git a/src/scripts/migrate.py b/src/scripts/migrate.py index 97368979f..caa1a699e 100644 --- a/src/scripts/migrate.py +++ b/src/scripts/migrate.py @@ -1,54 +1,94 @@ -"""Migration pipeline +"""Slurp migration pipeline -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING - -TODOs: +TODO's: - add CLI: look to makefile for what to include """ -import oakliblib -import pandas +import os +from argparse import ArgumentParser +from typing import Dict, List +import pandas as pd +import oaklib +from 
oaklib.resource import OntologyResource +from oaklib.implementations.sqldb.sql_implementation import SqlImplementation -#Inputs: -source_ontology = '' #e.g. omim -sssom_map = '' # e.g. mondo.sssom.tsv -min_id = '' -termlist_mondo = '' +# TODO: implement this func: +# todo: IDs should be int or str? prolly str +def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str: + """Starting from `min_id`, count up and check until finding the next ID.""" + next_id = str(0) + return next_id -def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''): - """source_ontology = '' #e.g. omim - sssom_map = '' # e.g. mondo.sssom.tsv - min_id = '' - termlist_mondo = ''""" - #Outputs: - data = [] - for t in source_ontology: - if t not in sssom_map['object_id']: - parents = [] +def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame: + """Run slurp pipeline for given ontology""" + # TODO: read this with OAK + ontology = SqlImplementation(OntologyResource(slug=ontology_path, local=True)) + sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t') + # TODO: Need to get the mondo terms, but ran out of memory on my alternate PC. get from other PC. 
+ print(f'exists: {mondo_terms_path}: ', os.path.exists(os.path.join(os.getcwd(), mondo_terms_path))) + mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t') + + source_onto_terms: List[str] = [] + sssom_object_ids = set(sssom_df['object_id']) + + data = [] + for t in source_onto_terms: + if t not in sssom_object_ids: migrate = True - for p in oaklib.get_direct_parents(t): - if p not in sssom_map['object_id']: + # todo: Find the correct way of doing this: + parents: List[str] = oaklib.get_direct_parents(t) + for parent in parents: + if parent not in sssom_object_ids: migrate = False break - elif sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:exactMatch' \ - or sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:narrowMatch': - # In other words, if the parent is mapped, and the mapping is either exact or narrower - parents.append(sssom_map[sssom_map['object_id']==p]['subject_id']) else: - # Its fine, just continue looking for other parents in this case + obj_data = sssom_df[sssom_df['object_id'] == parent] + pred = str(obj_data['predicate_id']) + if pred in ['skos:exactMatch', 'skos:narrowMatch']: + # In other words, if the parent is mapped, and the mapping is either exact or narrower + parents.append(obj_data['subject_id']) + else: + pass # Its fine, just continue looking for other parents in this case if migrate and parents: - next_mondo_id = determine_next_available_mondo_id(min_id, termlist_mondo) # satrting from min_id, then counting up and checking if it does not already exist. + # TODO: implement this func: + next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df) # satrting from min_id, then counting up and checking if it does not already exist. 
+ # todo: Find the correct way of doing this: label = oaklib.get_label(t) + # todo: Find the correct way of doing this: definition = oaklib.get_definition(t) - data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) + data.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) + + result = pd.DataFrame(data) + result.to_csv(outpath, sep="\t") + return result + - pandas.DataFrame(data).to_csv(fn, sep="\t") +def cli(): + """Command line interface.""" + package_description = \ + 'Slurp pipeline: Integrate new terms from other ontologies into Mondo.' + parser = ArgumentParser(description=package_description) + parser.add_argument( + '-o', '--ontology-path', required=True, + help='Path to ontology file, e.g. an `.owl` file.') + parser.add_argument( + '-m', '--sssom-map-path', required=True, + help='Path to file containing all known Mondo mappings, in SSSOM format.') + parser.add_argument( + '-i', '--min-id', required=True, + help='The ID from which we want to begin searching from in order to locate any currently unslurped terms.') + parser.add_argument( + '-t', '--mondo-terms-path', required=True, + help='Path to a file that contains a list of all Mondo terms.') + parser.add_argument( + '-O', '--outpath', required=True, + help='Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.') + d: Dict = vars(parser.parse_args()) + # todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok? + run(**d) if __name__ == '__main__': - run() + cli()