From 0a4abdf40a8d79a77907e04886aad66907b3a572 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Wed, 3 Aug 2022 16:10:53 -0400 Subject: [PATCH] Slurp pipeline - Add: makefile: Missing make goals for dependencies for slurp goal. - Add: Python: CLI - Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params. - Update: Python: Completed script, inspired by initial pseudo code. (WIP) --- src/ontology/mondo-ingest.Makefile | 27 ++++--- src/scripts/migrate.py | 109 ++++++++++++++++++++--------- 2 files changed, 93 insertions(+), 43 deletions(-) diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index d6a716a86..91f62d1c6 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -281,16 +281,27 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv slurp/: mkdir -p $@ -# Feel free to change the signature. Min ID is the next available Mondo ID. -slurp/%.tsv: components/%.owl tmp/mondo.sssom.tsv reports/mirror-signature-mondo.tsv | slurp/ +# TODO: Move this to Makefile. how, when I shouldn't edit it from this repo? +.PHONY: component-download-mondo.owl +component-download-mondo.owl: | $(TMPDIR) + if [ $(MIR) = true ] && [ $(COMP) = true ]; then $(ROBOT) merge -I http://purl.obolibrary.org/obo/mondo.owl \ + annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi + +# TODO: Move this to Makefile. how, when I shouldn't edit it from this repo? +# $(COMPONENTSDIR)/mondo.owl: component-download-mondo.owl +# if [ $(COMP) = true ] ; then if cmp -s $(TMPDIR)/component-download-mondo.owl.owl $@ ; then echo "Component identical."; else echo "Component is different, updating." 
&& cp $(TMPDIR)/component-download-mondo.owl.owl $@; fi; fi + +# min-id: the next available Mondo ID +slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/mirror_signature-mondo.tsv | slurp/ python $(SCRIPTSDIR)/migrate.py \ - -i $< \ - --mapping-file tmp/mondo.sssom.tsv \ + --ontology-path $(COMPONENTSDIR)/%.owl \ + --sssom-map-path $(TMPDIR)/mondo.sssom.tsv \ --min-id 123000 \ - --mondo-terms reports/mirror-signature-mondo.tsv \ - --output $@ + --mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \ + --outpath $@ slurp-%: slurp/%.tsv -# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo -slurp: slurp-omim +# TODO: change to all ontologies when ready +# slurp-all: slurp-omim slurp-doid slurp-ncit slurp-ordo slurp-icd10cm slurp-icd10who +slurp-all: slurp-omim diff --git a/src/scripts/migrate.py b/src/scripts/migrate.py index 97368979f..87e9a8ca3 100644 --- a/src/scripts/migrate.py +++ b/src/scripts/migrate.py @@ -1,54 +1,93 @@ -"""Migration pipeline +"""Slurp migration pipeline -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING -#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING - -TODOs: +TODO's: - add CLI: look to makefile for what to include """ -import oakliblib -import pandas +import os +from argparse import ArgumentParser +from typing import Dict, List +import oaklib +import pandas as pd -#Inputs: -source_ontology = '' #e.g. omim -sssom_map = '' # e.g. mondo.sssom.tsv -min_id = '' -termlist_mondo = '' +# TODO: implement this func: +# todo: IDs should be int or str? probably str +def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str: + """Starting from `min_id`, count up and check until finding the next ID.""" + next_id = str(0) + return next_id -def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''): - """source_ontology = '' #e.g. omim - sssom_map = '' # e.g. 
mondo.sssom.tsv - min_id = '' - termlist_mondo = ''""" - #Outputs: - data = [] - for t in source_ontology: - if t not in sssom_map['object_id']: - parents = [] +def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame: + """Run slurp pipeline for given ontology""" + # TODO: read this with OAK + source_ontology = ontology_path + sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t') + # TODO: Need to get the mondo terms, but ran out of memory on my alternate PC. get from other PC. + print(f'exists: {mondo_terms_path}: ', os.path.exists(os.path.join(os.getcwd(), mondo_terms_path))) + # mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t') + mondo_termlist_df = pd.DataFrame() + + source_onto_terms: List[str] = [] + sssom_object_ids = set(sssom_df['object_id']) + + data = [] + for t in source_onto_terms: + if t not in sssom_object_ids: migrate = True - for p in oaklib.get_direct_parents(t): - if p not in sssom_map['object_id']: + # todo: Find the correct way of doing this: + parents: List[str] = oaklib.get_direct_parents(t) + for parent in parents: + if parent not in sssom_object_ids: migrate = False break - elif sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:exactMatch' \ - or sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:narrowMatch': - # In other words, if the parent is mapped, and the mapping is either exact or narrower - parents.append(sssom_map[sssom_map['object_id']==p]['subject_id']) else: - # Its fine, just continue looking for other parents in this case + obj_data = sssom_df[sssom_df['object_id'] == parent] + pred = str(obj_data['predicate_id']) + if pred in ['skos:exactMatch', 'skos:narrowMatch']: + # In other words, if the parent is mapped, and the mapping is either exact or narrower + parents.append(obj_data['subject_id']) + else: + pass # It's fine, just continue looking for other parents in this case if migrate and parents: - next_mondo_id = 
determine_next_available_mondo_id(min_id, termlist_mondo) # satrting from min_id, then counting up and checking if it does not already exist. + # TODO: implement this func: + next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df) # starting from min_id, then counting up and checking if it does not already exist. + # todo: Find the correct way of doing this: label = oaklib.get_label(t) + # todo: Find the correct way of doing this: definition = oaklib.get_definition(t) - data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) + data.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) + + result = pd.DataFrame(data) + result.to_csv(outpath, sep="\t") + return result + - pandas.DataFrame(data).to_csv(fn, sep="\t") +def cli(): + """Command line interface.""" + package_description = \ + 'Slurp pipeline: Integrate new terms from other ontologies into Mondo.' + parser = ArgumentParser(description=package_description) + parser.add_argument( + '-o', '--ontology-path', required=True, + help='Path to ontology file, e.g. an `.owl` file.') + parser.add_argument( + '-m', '--sssom-map-path', required=True, + help='Path to file containing all known Mondo mappings, in SSSOM format.') + parser.add_argument( + '-i', '--min-id', required=True, + help='The ID from which we want to begin searching from in order to locate any currently unslurped terms.') + parser.add_argument( + '-t', '--mondo-terms-path', required=True, + help='Path to a file that contains a list of all Mondo terms.') + parser.add_argument( + '-O', '--outpath', required=True, + help='Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.') + d: Dict = vars(parser.parse_args()) + # todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok? + run(**d) if __name__ == '__main__': - run() + cli()