Skip to content

Commit

Permalink
Slurp pipeline
Browse files Browse the repository at this point in the history
- Add: makefile: Missing make goals for dependencies for slurp goal.
- Add: Python: CLI
- Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params.
- Update: Python: Completed script, inspired by initial pseudo code. (WIP)
  • Loading branch information
joeflack4 committed Aug 4, 2022
1 parent 769decf commit 9c9f296
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 46 deletions.
37 changes: 26 additions & 11 deletions src/ontology/mondo-ingest.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## If you need to customize your Makefile, make
## changes here rather than in the main Makefile
# Goals that do not correspond to files on disk.
# NOTE: .PHONY takes literal target names only; a pattern such as `slurp-%` would be
# treated as a target literally named `slurp-%`, so the per-ontology slurp goals are
# not (and cannot be) listed here.
.PHONY: deploy-mondo-ingest build-mondo-ingest documentation mappings update-jinja-sparql-queries \
    report-mapping-annotations python-install-dependencies slurp-all

####################################
### Standard constants #############
Expand Down Expand Up @@ -157,7 +157,7 @@ metadata/mondo.sssom.config.yml:
mappings: sssom $(ALL_MAPPINGS)

#################
# Utils #########
##### Utils #####
#################
# Documentation for `report-mapping-annotations` and `update-jinja-sparql-queries`: `docs/developer/ordo.md`
# TODO: When https://github.com/monarch-initiative/mondo-ingest/issues/43 is fixed, can change back to `requirements.txt`
Expand All @@ -172,6 +172,9 @@ update-jinja-sparql-queries: python-install-dependencies
python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_replace_annotation_based_mappings.py
python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_mapping_annotations_violation.py

#################
## Exclusions ###
#################
config/%_term_exclusions.txt: config/%_exclusions.tsv component-download-%.owl $(REPORTDIR)/mirror_signature-%.tsv $(REPORTDIR)/component_signature-%.tsv metadata/%.yml python-install-dependencies
python3 $(SCRIPTSDIR)/exclusion_term_expansion.py \
--onto-name $* \
Expand Down Expand Up @@ -237,7 +240,7 @@ tmp/mondo.sssom.tsv:
wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@

# Convert the Mondo SSSOM mapping table into its RDF form.
# The output must be exactly $@; stray text fused onto `$@` would send the result to a
# different path and leave this target missing.
tmp/mondo.sssom.ttl: tmp/mondo.sssom.tsv
	sssom convert $< -O rdf -o $@

# Merge Mondo, precise mappings and mondo-ingest into one coherent whole for the purpose of querying.

Expand All @@ -250,6 +253,7 @@ $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv: ../sparql/mondo-ordo-unsupport
.PHONY: mondo-ordo-subclass
mondo-ordo-subclass: $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv

# TODO: I believe we'll need a special case/goal for `reports/mirror_signature-mondo.tsv`, which will depend on `tmp/mondo.owl` instead.
# Run ../sparql/classes.sparql against the file produced by the matching
# `component-download-%.owl` goal and write the query result to $@.
# The input path ends in a doubled `.owl` because the goal's own name (which already
# ends in `.owl`) is used as the file stem under $(TMPDIR).
reports/mirror_signature-%.tsv: component-download-%.owl
$(ROBOT) query -i $(TMPDIR)/$<.owl --query ../sparql/classes.sparql $@

Expand Down Expand Up @@ -278,19 +282,30 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv
#############################
###### Slurp pipeline #######
#############################
# TODO: (a) Move this to Makefile, or (b) refactor this away.
# Fetch Mondo from its PURL and write an annotated copy to
# $(TMPDIR)/component-download-mondo.owl.owl. Nothing is done unless both MIR and COMP
# are `true`. $(TMPDIR) is order-only: it must exist, but its timestamp is ignored.
# MIR and COMP are quoted so that an empty or unset value makes the test false instead
# of producing a `[` usage error.
.PHONY: component-download-mondo.owl
component-download-mondo.owl: | $(TMPDIR)
	if [ "$(MIR)" = true ] && [ "$(COMP)" = true ]; then $(ROBOT) merge -I http://purl.obolibrary.org/obo/mondo.owl \
	annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi

# TODO: (a) Move this to Makefile, or (b) refactor this away, or (c) DELETE this goal.
# ...I think (c) is most likely, as `reports/mirror_signature-%.tsv` needs `component-download-%.owl`.
# ...`reports/component_signature-%.tsv` is what needs `components/%.owl`, but that shouldn't be needed here.
# $(COMPONENTSDIR)/mondo.owl: component-download-mondo.owl
# if [ $(COMP) = true ] ; then if cmp -s $(TMPDIR)/component-download-mondo.owl.owl $@ ; then echo "Component identical."; else echo "Component is different, updating." && cp $(TMPDIR)/component-download-mondo.owl.owl $@; fi; fi

# Output directory for the slurp tables; used as an order-only prerequisite of
# `slurp/%.tsv` and created on demand.
slurp/:
mkdir -p $@

# Build the slurp table for one source ontology; the stem is the ontology name
# (e.g. `omim`). Every option is passed by its long name and matches the CLI declared
# in $(SCRIPTSDIR)/migrate.py (--ontology-path, --sssom-map-path, --min-id,
# --mondo-terms-path, --outpath).
# `python3` is used for consistency with the other Python recipes in this file.
# min-id: the next available Mondo ID
slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/mirror_signature-mondo.tsv | slurp/
	python3 $(SCRIPTSDIR)/migrate.py \
	--ontology-path $(COMPONENTSDIR)/$*.owl \
	--sssom-map-path $(TMPDIR)/mondo.sssom.tsv \
	--min-id 123000 \
	--mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \
	--outpath $@

# Convenience alias: `make slurp-omim` builds slurp/omim.tsv.
# GNU Make ignores a pattern rule that has no recipe (it only cancels a matching
# implicit rule), so the alias needs an explicit no-op recipe to be usable.
# slurp/%.tsv is reached only through this pattern chain, which makes it an
# intermediate file that make would delete afterwards; .PRECIOUS keeps it.
.PRECIOUS: slurp/%.tsv
slurp-%: slurp/%.tsv
	@:

# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo
slurp: slurp-omim
slurp-all: slurp-omim slurp-doid slurp-ncit slurp-ordo slurp-icd10cm slurp-icd10who
1 change: 1 addition & 0 deletions src/ontology/slurp/omim.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
""
110 changes: 75 additions & 35 deletions src/scripts/migrate.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,94 @@
"""Migration pipeline
"""Slurp migration pipeline
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
TODOs:
TODO's:
- add CLI: look to makefile for what to include
"""
import oakliblib
import pandas
import os
from argparse import ArgumentParser
from typing import Dict, List

import pandas as pd
import oaklib
from oaklib.resource import OntologyResource
from oaklib.implementations.sqldb.sql_implementation import SqlImplementation

#Inputs:
source_ontology = '' #e.g. omim
sssom_map = '' # e.g. mondo.sssom.tsv
min_id = ''
termlist_mondo = ''

# TODO: implement this func:
# todo: IDs should be int or str? prolly str
def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str:
    """Placeholder for the lookup of the next free Mondo ID.

    Intended behaviour (not implemented yet): starting from `min_id`, count up and
    check until finding the next ID. For now both arguments are ignored and the
    string '0' is always returned.

    :param min_id: ID from which the search is meant to start (currently unused).
    :param mondo_termlist_df: table of existing Mondo terms (currently unused).
    :return: always '0'.
    """
    return "0"

def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame:
    """Run slurp pipeline for given ontology.

    A source term is a migration candidate when it has no Mondo mapping itself, every
    one of its direct parents is mapped, and at least one of those parent mappings is
    `skos:exactMatch` or `skos:narrowMatch`.

    :param ontology_path: path to the source ontology file.
    :param sssom_map_path: path to the SSSOM TSV holding the Mondo mappings; must have
        `subject_id`, `predicate_id` and `object_id` columns.
    :param min_id: ID handed to `determine_next_available_mondo_id` as starting point.
    :param mondo_terms_path: path to a TSV listing the existing Mondo terms.
    :param outpath: where the resulting table is written (tab-separated).
    :return: the table that was written to `outpath`.
    """
    # TODO: read this with OAK
    ontology = SqlImplementation(OntologyResource(slug=ontology_path, local=True))
    sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t')
    # TODO: Need to get the mondo terms, but ran out of memory on my alternate PC. get from other PC.
    print(f'exists: {mondo_terms_path}: ', os.path.exists(os.path.join(os.getcwd(), mondo_terms_path)))
    mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t')

    # TODO: populate from `ontology`; while this list is empty no term is migrated.
    source_onto_terms: List[str] = []
    sssom_object_ids = set(sssom_df['object_id'])
    accepted_predicates = ['skos:exactMatch', 'skos:narrowMatch']

    data = []
    for t in source_onto_terms:
        if t in sssom_object_ids:
            continue  # already mapped to Mondo, nothing to slurp

        migrate = True
        # Mondo IDs of mapped parents are collected separately from the source
        # parents: appending to the list being iterated would extend the loop, and
        # the two lists hold IDs from different namespaces.
        mondo_parents: List[str] = []
        # todo: Find the correct way of doing this:
        parents: List[str] = oaklib.get_direct_parents(t)
        for parent in parents:
            if parent not in sssom_object_ids:
                migrate = False
                break
            obj_data = sssom_df[sssom_df['object_id'] == parent]
            # Compare per row; str() of the whole column never equals a predicate name.
            # In other words, if the parent is mapped, and the mapping is either exact or narrower
            matched = obj_data[obj_data['predicate_id'].isin(accepted_predicates)]
            # Parents with other predicates are fine, just continue looking.
            mondo_parents.extend(matched['subject_id'].tolist())

        if migrate and mondo_parents:
            # TODO: implement this func:
            # Starting from min_id, then counting up and checking if it does not already exist.
            next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df)
            # todo: Find the correct way of doing this:
            label = oaklib.get_label(t)
            # todo: Find the correct way of doing this:
            definition = oaklib.get_definition(t)
            data.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition})

    result = pd.DataFrame(data)
    result.to_csv(outpath, sep="\t")
    return result


pandas.DataFrame(data).to_csv(fn, sep="\t")
def cli():
    """Command line interface: parse the five required options and hand them to `run`."""
    parser = ArgumentParser(
        description='Slurp pipeline: Integrate new terms from other ontologies into Mondo.')
    # (short flag, long flag, help text); every option is mandatory.
    options = [
        ('-o', '--ontology-path',
         'Path to ontology file, e.g. an `.owl` file.'),
        ('-m', '--sssom-map-path',
         'Path to file containing all known Mondo mappings, in SSSOM format.'),
        ('-i', '--min-id',
         'The ID from which we want to begin searching from in order to locate any currently unslurped terms.'),
        ('-t', '--mondo-terms-path',
         'Path to a file that contains a list of all Mondo terms.'),
        ('-O', '--outpath',
         'Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.'),
    ]
    for short_flag, long_flag, help_text in options:
        parser.add_argument(short_flag, long_flag, required=True, help=help_text)
    kwargs: Dict = vars(parser.parse_args())
    # todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok?
    run(**kwargs)


if __name__ == '__main__':
    # `run` needs its five path/ID arguments, so the script is entered through the
    # argument-parsing wrapper rather than by calling `run()` bare.
    cli()

0 comments on commit 9c9f296

Please sign in to comment.