Slurp pipeline

- Add: makefile: Missing make goals for dependencies for slurp goal. - Add: Python: CLI - Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params. - Update: Python: Completed script, inspired by initial pseudocode. (WIP)
monarch-initiative · Aug 4, 2022 · 9c9f296 · 9c9f296
1 parent 769decf
commit 9c9f296
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 46 deletions.
diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile
@@ -3,7 +3,7 @@
 ## If you need to customize your Makefile, make
 ## changes here rather than in the main Makefile
 .PHONY: deploy-mondo-ingest build-mondo-ingest documentation mappings update-jinja-sparql-queries \
-report-mapping-annotations python-install-dependencies
+report-mapping-annotations python-install-dependencies slurp-all slurp-%
 
 ####################################
 ### Standard constants #############
@@ -157,7 +157,7 @@ metadata/mondo.sssom.config.yml:
 mappings: sssom $(ALL_MAPPINGS)
 
 #################
-# Utils #########
+##### Utils #####
 #################
 # Documentation for `report-mapping-annotations` and `update-jinja-sparql-queries`: `docs/developer/ordo.md`
 # TODO: When https://github.com/monarch-initiative/mondo-ingest/issues/43 is fixed, can change back to `requirements.txt`
@@ -172,6 +172,9 @@ update-jinja-sparql-queries: python-install-dependencies
 	python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_replace_annotation_based_mappings.py
 	python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_mapping_annotations_violation.py
 
+#################
+## Exclusions ###
+#################
 config/%_term_exclusions.txt: config/%_exclusions.tsv component-download-%.owl $(REPORTDIR)/mirror_signature-%.tsv $(REPORTDIR)/component_signature-%.tsv metadata/%.yml python-install-dependencies
 	python3 $(SCRIPTSDIR)/exclusion_term_expansion.py \
 	--onto-name $* \
@@ -237,7 +240,7 @@ tmp/mondo.sssom.tsv:
 	wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@
 
 tmp/mondo.sssom.ttl: tmp/mondo.sssom.tsv
 	sssom convert $< -O rdf -o $@
 
 # Merge Mondo, precise mappings and mondo-ingest into one coherent whole for the purpose of querying.
 
@@ -250,6 +253,7 @@ $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv: ../sparql/mondo-ordo-unsupport
 .PHONY: mondo-ordo-subclass
 mondo-ordo-subclass: $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv
 
+# TODO: I believe we'll need a special case/goal for `reports/mirror_signature-mondo.tsv`, which will depend on `tmp/mondo.owl` instead.
 reports/mirror_signature-%.tsv: component-download-%.owl
 	$(ROBOT) query -i $(TMPDIR)/$<.owl --query ../sparql/classes.sparql $@
 
@@ -278,19 +282,30 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv
 #############################
 ###### Slurp pipeline #######
 #############################
+# TODO: (a) Move this to Makefile, or (b) refactor this away.
+.PHONY: component-download-mondo.owl
+component-download-mondo.owl: | $(TMPDIR)
+	if [ $(MIR) = true ] && [ $(COMP) = true ]; then $(ROBOT) merge -I http://purl.obolibrary.org/obo/mondo.owl \
+	annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi
+
+# TODO: (a) Move this to Makefile, or (b) refactor this away, or (c) DELETE this goal.
+# ...I think (c) is most likely, as `reports/mirror_signature-%.tsv` needs `component-download-%.owl`.
+# ...`reports/component_signature-%.tsv` is what needs `components/%.owl`, but that shouldn't be needed here.
+# $(COMPONENTSDIR)/mondo.owl: component-download-mondo.owl
+# 	if [ $(COMP) = true ] ; then if cmp -s $(TMPDIR)/component-download-mondo.owl.owl $@ ; then echo "Component identical."; else echo "Component is different, updating." && cp $(TMPDIR)/component-download-mondo.owl.owl $@; fi; fi
+
 slurp/:
 	mkdir -p $@
 
-# Feel free to change the signature. Min ID is the next available Mondo ID.
-slurp/%.tsv: components/%.owl tmp/mondo.sssom.tsv reports/mirror-signature-mondo.tsv | slurp/
+# min-id: the next available Mondo ID
+slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/mirror_signature-mondo.tsv | slurp/
 	python $(SCRIPTSDIR)/migrate.py \
-	-i $< \
-	--mapping-file tmp/mondo.sssom.tsv \
+	--ontology-path $(COMPONENTSDIR)/$*.owl \
+	--sssom-map-path $(TMPDIR)/mondo.sssom.tsv \
 	--min-id 123000 \
-	--mondo-terms reports/mirror-signature-mondo.tsv \
-	--output $@
+	--mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \
+	--outpath $@
 
 slurp-%: slurp/%.tsv
 
-# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo
-slurp: slurp-omim
+slurp-all: slurp-omim slurp-doid slurp-ncit slurp-ordo slurp-icd10cm slurp-icd10who
diff --git a/src/ontology/slurp/omim.tsv b/src/ontology/slurp/omim.tsv
@@ -0,0 +1 @@
+""
diff --git a/src/scripts/migrate.py b/src/scripts/migrate.py
@@ -1,54 +1,94 @@
-"""Migration pipeline
+"""Slurp migration pipeline
 
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
-
-TODOs:
+TODO's:
   - add CLI: look to makefile for what to include
 """
-import oakliblib
-import pandas
+import os
+from argparse import ArgumentParser
+from typing import Dict, List
 
+import pandas as pd
+import oaklib
+from oaklib.resource import OntologyResource
+from oaklib.implementations.sqldb.sql_implementation import SqlImplementation
 
-#Inputs:
-source_ontology = ''  #e.g. omim
-sssom_map = ''  # e.g. mondo.sssom.tsv
-min_id = ''
-termlist_mondo = ''
 
+# TODO: implement this func:
+# todo: Should IDs be int or str? Probably str.
+def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str:
+    """Starting from `min_id`, count up and check until finding the next ID."""
+    next_id = str(0)
+    return next_id
 
-def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''):
-    """source_ontology = ''  #e.g. omim
-    sssom_map = ''  # e.g. mondo.sssom.tsv
-    min_id = ''
-    termlist_mondo = ''"""
-    #Outputs:
-    data = []
 
-    for t in source_ontology:
-        if t not in sssom_map['object_id']:
-            parents = []
+def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame:
+    """Run slurp pipeline for given ontology"""
+    # TODO: read this with OAK
+    ontology = SqlImplementation(OntologyResource(slug=ontology_path, local=True))
+    sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t')
+    # TODO: Need to get the mondo terms, but ran out of memory on my alternate PC. get from other PC.
+    print(f'exists: {mondo_terms_path}: ', os.path.exists(os.path.join(os.getcwd(), mondo_terms_path)))
+    mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t')
+
+    source_onto_terms: List[str] = []
+    sssom_object_ids = set(sssom_df['object_id'])
+
+    data = []
+    for t in source_onto_terms:
+        if t not in sssom_object_ids:
             migrate = True
-            for p in oaklib.get_direct_parents(t):
-                if p not in sssom_map['object_id']:
+            # todo: Find the correct way of doing this:
+            parents: List[str] = oaklib.get_direct_parents(t)
+            for parent in parents:
+                if parent not in sssom_object_ids:
                     migrate = False
                     break
-                elif sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:exactMatch' \
-                    or sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:narrowMatch':
-                    # In other words, if the parent is mapped, and the mapping is either exact or narrower
-                    parents.append(sssom_map[sssom_map['object_id']==p]['subject_id'])
                 else:
-                    # Its fine, just continue looking for other parents in this case
+                    obj_data = sssom_df[sssom_df['object_id'] == parent]
+                    pred = str(obj_data['predicate_id'])
+                    if pred in ['skos:exactMatch', 'skos:narrowMatch']:
+                        # In other words, if the parent is mapped, and the mapping is either exact or narrower
+                        parents.append(obj_data['subject_id'])
+                    else:
+                        pass  # It's fine; just continue looking for other parents in this case
             if migrate and parents:
-                next_mondo_id = determine_next_available_mondo_id(min_id, termlist_mondo) # satrting from min_id, then counting up and checking if it does not already exist.
+                # TODO: implement this func:
+                next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df)  # starting from min_id, then counting up and checking if it does not already exist.
+                # todo: Find the correct way of doing this:
                 label = oaklib.get_label(t)
+                # todo: Find the correct way of doing this:
                 definition = oaklib.get_definition(t)
-                data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition})
+                data.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition})
+
+    result = pd.DataFrame(data)
+    result.to_csv(outpath, sep="\t")
+    return result
+
 
-    pandas.DataFrame(data).to_csv(fn, sep="\t")
+def cli():
+    """Command line interface."""
+    package_description = \
+        'Slurp pipeline: Integrate new terms from other ontologies into Mondo.'
+    parser = ArgumentParser(description=package_description)
+    parser.add_argument(
+        '-o', '--ontology-path', required=True,
+        help='Path to ontology file, e.g. an `.owl` file.')
+    parser.add_argument(
+        '-m', '--sssom-map-path', required=True,
+        help='Path to file containing all known Mondo mappings, in SSSOM format.')
+    parser.add_argument(
+        '-i', '--min-id', required=True,
+        help='The ID from which we want to begin searching from in order to locate any currently unslurped terms.')
+    parser.add_argument(
+        '-t', '--mondo-terms-path', required=True,
+        help='Path to a file that contains a list of all Mondo terms.')
+    parser.add_argument(
+        '-O', '--outpath', required=True,
+        help='Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.')
+    d: Dict = vars(parser.parse_args())
+    # todo: Convert paths to absolute paths, as I've done before? Or expect this to always be run from src/ontology?
+    run(**d)
 
 
 if __name__ == '__main__':
-    run()
+    cli()