Skip to content

Commit

Permalink
Slurp pipeline
Browse files Browse the repository at this point in the history
- Add: makefile: Missing make goals for dependencies for slurp goal.
- Add: Python: CLI
- Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params.
- Update: Python: Completed script, inspired by initial pseudo code. (WIP)
  • Loading branch information
joeflack4 committed Aug 5, 2022
1 parent 769decf commit de81b23
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 48 deletions.
37 changes: 26 additions & 11 deletions src/ontology/mondo-ingest.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## If you need to customize your Makefile, make
## changes here rather than in the main Makefile
.PHONY: deploy-mondo-ingest build-mondo-ingest documentation mappings update-jinja-sparql-queries \
report-mapping-annotations python-install-dependencies
report-mapping-annotations python-install-dependencies slurp-all slurp-%

####################################
### Standard constants #############
Expand Down Expand Up @@ -157,7 +157,7 @@ metadata/mondo.sssom.config.yml:
mappings: sssom $(ALL_MAPPINGS)

#################
# Utils #########
##### Utils #####
#################
# Documentation for `report-mapping-annotations` and `update-jinja-sparql-queries`: `docs/developer/ordo.md`
# TODO: When https://github.com/monarch-initiative/mondo-ingest/issues/43 is fixed, can change back to `requirements.txt`
Expand All @@ -172,6 +172,9 @@ update-jinja-sparql-queries: python-install-dependencies
python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_replace_annotation_based_mappings.py
python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_mapping_annotations_violation.py

#################
## Exclusions ###
#################
config/%_term_exclusions.txt: config/%_exclusions.tsv component-download-%.owl $(REPORTDIR)/mirror_signature-%.tsv $(REPORTDIR)/component_signature-%.tsv metadata/%.yml python-install-dependencies
python3 $(SCRIPTSDIR)/exclusion_term_expansion.py \
--onto-name $* \
Expand Down Expand Up @@ -237,7 +240,7 @@ tmp/mondo.sssom.tsv:
wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@

# Convert the downloaded Mondo SSSOM mapping table to RDF.
# The output must be exactly $@; any extra text after it sends the result to a
# different file and leaves this target permanently out of date.
tmp/mondo.sssom.ttl: tmp/mondo.sssom.tsv
	sssom convert $< -O rdf -o $@

# Merge Mondo, precise mappings and mondo-ingest into one coherent whole for the purpose of querying.

Expand All @@ -250,6 +253,7 @@ $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv: ../sparql/mondo-ordo-unsupport
.PHONY: mondo-ordo-subclass
mondo-ordo-subclass: $(REPORTDIR)/mondo_ordo_unsupported_subclass.tsv

# TODO: I believe we'll need a special case/goal for `reports/mirror_signature-mondo.tsv`, which will depend on `tmp/mondo.owl` instead.
reports/mirror_signature-%.tsv: component-download-%.owl
$(ROBOT) query -i $(TMPDIR)/$<.owl --query ../sparql/classes.sparql $@

Expand Down Expand Up @@ -278,19 +282,30 @@ lexical_matches: mappings/mondo-sources-all-lexical.sssom.tsv
#############################
###### Slurp pipeline #######
#############################
# TODO: (a) Move this to Makefile, or (b) refactor this away.
# Downloads Mondo and writes an annotated copy to $(TMPDIR)/component-download-mondo.owl.owl.
# The goal name is not the file it writes, hence .PHONY. Nothing is done unless
# both MIR and COMP are `true`.
# $(TMPDIR) is order-only: it has to exist, but its mtime must not matter.
.PHONY: component-download-mondo.owl
component-download-mondo.owl: | $(TMPDIR)
	# Quoted so that an empty/unset MIR or COMP is a false test, not a shell syntax error.
	if [ "$(MIR)" = true ] && [ "$(COMP)" = true ]; then $(ROBOT) merge -I http://purl.obolibrary.org/obo/mondo.owl \
	annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) -o $(TMPDIR)/$@.owl; fi

# TODO: (a) Move this to Makefile, or (b) refactor this away, or (c) DELETE this goal.
# ...I think (c) is most likely, as `reports/mirror_signature-%.tsv` needs `component-download-%.owl`.
# ...`reports/component_signature-%.tsv` is what needs `components/%.owl`, but that shouldn't be needed here.
# $(COMPONENTSDIR)/mondo.owl: component-download-mondo.owl
# if [ $(COMP) = true ] ; then if cmp -s $(TMPDIR)/component-download-mondo.owl.owl $@ ; then echo "Component identical."; else echo "Component is different, updating." && cp $(TMPDIR)/component-download-mondo.owl.owl $@; fi; fi

slurp/:
mkdir -p $@

# Build the slurp table for one source ontology; the stem is the ontology name
# (e.g. `slurp/omim.tsv` is built from `$(COMPONENTSDIR)/omim.owl`).
# --min-id: the next available Mondo ID.
# `slurp/` is order-only: it has to exist, but its mtime must not trigger rebuilds.
slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/mirror_signature-mondo.tsv | slurp/
	python3 $(SCRIPTSDIR)/migrate.py \
		--ontology-path $< \
		--sssom-map-path $(TMPDIR)/mondo.sssom.tsv \
		--min-id 123000 \
		--mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \
		--outpath $@

# Convenience alias: `make slurp-omim` builds `slurp/omim.tsv`.
# In GNU Make a pattern rule WITHOUT a recipe does not define a rule (it only
# cancels a matching implicit rule), so the no-op recipe `@:` is required.
# Because slurp/%.tsv is reached only through this implicit-rule chain, Make
# would treat it as an intermediate file and delete it afterwards; .PRECIOUS
# with the target pattern keeps it.
# NOTE(review): drop .PRECIOUS if the main Makefile already declares a global `.SECONDARY:`.
.PRECIOUS: slurp/%.tsv
slurp-%: slurp/%.tsv
	@:

# TODO: add more ontologies, e.g.: doid, icd10cm, icd10who, ncit, ordo
slurp: slurp-omim
slurp-all: slurp-omim slurp-doid slurp-ncit slurp-ordo slurp-icd10cm slurp-icd10who
158 changes: 121 additions & 37 deletions src/scripts/migrate.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,138 @@
"""Migration pipeline
"""Slurp migration pipeline
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING
TODOs:
- add CLI: look to makefile for what to include
TODO's:
-
"""
import oakliblib
import pandas
import os.path
from argparse import ArgumentParser
from typing import Dict, List

import pandas as pd
import oaklib
from oaklib.implementations import ProntoImplementation
from oaklib.resource import OntologyResource


# TODO: implement this func:
# todo: IDs should be int or str? prolly str
def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str:
    """Starting from `min_id`, count up and check until finding the next ID.

    :param min_id: Lowest ID to consider. Only its trailing run of digits is used, so both `123000` and
      `MONDO:0123000` are accepted.
    :param mondo_termlist_df: Table listing existing Mondo terms. Every cell of every column is scanned for
      `MONDO:<digits>` / `MONDO_<digits>`, so CURIEs and OBO PURLs are both recognised.
      NOTE(review): assumes terms are written in one of those two forms -- confirm against the SPARQL output.
    :return: The first numeric ID >= `min_id` that is not already used, as a plain (not zero-padded) string.
      todo: IDs should be int or str? prolly str. Padding / `MONDO:` prefixing is left to the caller.
    :raises ValueError: If `min_id` contains no trailing digits.
    """
    import re  # local import: `re` is not among this module's top-level imports

    min_id_match = re.search(r'(\d+)\s*$', str(min_id))
    if min_id_match is None:
        raise ValueError(f'Cannot parse a numeric Mondo ID from min_id={min_id!r}')

    # Collect the numeric part of every Mondo ID that is already taken.
    mondo_id_pattern = re.compile(r'MONDO[:_](\d+)')
    taken_ids = set()
    for column in mondo_termlist_df.columns:
        for cell in mondo_termlist_df[column].astype(str):
            for found in mondo_id_pattern.finditer(cell):
                taken_ids.add(int(found.group(1)))

    next_id = int(min_id_match.group(1))
    while next_id in taken_ids:
        next_id += 1
    return str(next_id)


def _get_direct_parents(curie: str, ontology: ProntoImplementation):
"""Get CURIEs of parents of a class
todo: `ontology` typing: should probably be typed agnostic of the implementation. But it inherits from so
much stuff, I'm not sure which if any of these to use. I was expecting `BasicOntologyInterface`, but it doesn't
inherit from that: RelationGraphInterface, OboGraphInterface, ValidatorInterface, SearchInterface,
SubsetterInterface, MappingProviderInterface, PatcherInterface, SemanticSimilarityInterface, MetadataInterface,
DifferInterface, ABC
"""
# TODO: How to get just the *direct* parents? (a) func for that? (b) look at relationships and see if subClass?
# rels = ontology.get_outgoing_relationships_by_curie(curie) # works for ProntoImplementation
rels = ontology.relationships([curie])
p = []
for rel, parents in rels.items():
# print(f' {rel} ! {ontology.get_label_by_curie(rel)}')
for parent in parents:
# print(f' {parent} ! {ontology.get_label_by_curie(parent)}')
p.append(parent)
return p

#Inputs:
source_ontology = '' #e.g. omim
sssom_map = '' # e.g. mondo.sssom.tsv
min_id = ''
termlist_mondo = ''

def _get_all_term_curies(ontology: ProntoImplementation, is_omim=False) -> List[str]:
"""Get all terms as CURIEs
todo: Will we need prefix_map from our config `.yml`s? Or can OAK handle automatically?
- reasons we might need: (i) filter correct terms, (ii) convert URI to CURIE if get URI
"""
# todo: if oak can't handle automatically, I think we'll need `prefix_map`:
terms = [x for x in ontology.entities()]
# todo: temp until I figure out if (a) we need `prefix_map`, or (b) OAK can somehow handle automatically:
if is_omim:
terms = [x for x in terms if any([x.startswith(y) for y in [
'OMIM:', 'OMIMPS:', 'https://omim.org/entry/', 'https://omim.org/phenotypicSeries/PS']])]
# prefix_map = {
# 'OMIM': 'https://omim.org/entry/',
# 'OMIMPS': 'https://omim.org/phenotypicSeries/PS',
# }
# todo: convert to CURIE using charlie's new lib
return terms

def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''):
"""source_ontology = '' #e.g. omim
sssom_map = '' # e.g. mondo.sssom.tsv
min_id = ''
termlist_mondo = ''"""
#Outputs:
data = []

for t in source_ontology:
if t not in sssom_map['object_id']:
parents = []
def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame:
    """Run slurp pipeline for given ontology

    A source term is "slurpable" when (i) it is not itself an `object_id` in the SSSOM mappings, (ii) every one
    of its parents is an `object_id` in the mappings, and (iii) at least one parent is mapped with
    `skos:exactMatch` or `skos:narrowMatch`.

    :param ontology_path: Path to the source ontology file, loaded through `ProntoImplementation`.
    :param sssom_map_path: Path to the SSSOM mapping TSV; `object_id`, `predicate_id` and `subject_id` columns
      are read. Lines starting with `#` are skipped.
    :param min_id: Passed to `determine_next_available_mondo_id`.
    :param mondo_terms_path: Path to a TSV listing existing Mondo terms.
    :param outpath: Where the resulting TSV is written.
    :return: DataFrame with one row per slurpable term: `mondo_id`, `xref`, `label`, `definition`.
    """
    # Read source files
    # todo: `ontology`: If trouble, can try `SparqlImplementation`, but ~6 min to load and queries slow (cuz rdflib)
    ontology = ProntoImplementation(OntologyResource(slug=ontology_path, local=True))
    sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t')
    mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t')

    # Initialize variables
    # todo: is_omim is temporary
    source_onto_terms: List[str] = _get_all_term_curies(ontology, is_omim='omim' in os.path.basename(ontology_path))
    sssom_object_ids = set(sssom_df['object_id'])
    slurpable_predicates = ['skos:exactMatch', 'skos:narrowMatch']

    # Get slurpable terms
    slurpable_terms = []
    for t in source_onto_terms:
        if t in sssom_object_ids:
            continue  # already mapped; nothing to slurp
        migrate = True
        # todo: probably best to add all t[] at once, given the OAK api
        direct_parents: List[str] = _get_direct_parents(t, ontology)
        slurpable_parents: List[str] = []
        for parent in direct_parents:
            if parent not in sssom_object_ids:
                # An unmapped parent blocks migration of this term.
                migrate = False
                break
            obj_data = sssom_df[sssom_df['object_id'] == parent]
            # Compare row-wise: `str(Series)` is the Series' printed repr and never equals a predicate.
            ok_rows = obj_data[obj_data['predicate_id'].isin(slurpable_predicates)]
            # In other words, if the parent is mapped, and the mapping is either exact or narrower, OK to add.
            # Otherwise it's fine, just continue looking for other parents.
            slurpable_parents.extend(ok_rows['subject_id'])
        if migrate and slurpable_parents:
            # Starting from min_id, then counting up and checking if it does not already exist.
            # NOTE(review): called with the same arguments for every term, so several terms may receive the
            #  same ID -- confirm how allocated IDs should be tracked across iterations.
            next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df)
            # The top-level `oaklib` package has no `get_label` / `get_definition`; ask the loaded ontology.
            # NOTE(review): `label()` / `definition()` are the OAK basic-interface names -- confirm against
            #  the pinned oaklib version (older releases: `get_label_by_curie` / `get_definition_by_curie`).
            label = ontology.label(t)
            definition = ontology.definition(t)
            slurpable_terms.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition})

    result = pd.DataFrame(slurpable_terms)
    result.to_csv(outpath, sep="\t")
    return result


pandas.DataFrame(data).to_csv(fn, sep="\t")
def cli():
    """Command line interface.

    Parses the five required path/ID options and forwards them to `run` as keyword arguments; argparse turns
    each `--long-option` into the matching `long_option` parameter name.
    """
    package_description = \
        'Slurp pipeline: Integrate new terms from other ontologies into Mondo.'
    parser = ArgumentParser(description=package_description)
    parser.add_argument(
        '-o', '--ontology-path', required=True,
        help='Path to ontology file, e.g. an `.owl` file.')
    parser.add_argument(
        '-m', '--sssom-map-path', required=True,
        help='Path to file containing all known Mondo mappings, in SSSOM format.')
    parser.add_argument(
        '-i', '--min-id', required=True,
        help='The ID from which we want to begin searching from in order to locate any currently unslurped terms.')
    parser.add_argument(
        '-t', '--mondo-terms-path', required=True,
        help='Path to a file that contains a list of all Mondo terms.')
    parser.add_argument(
        '-O', '--outpath', required=True,
        help='Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.')
    d: Dict = vars(parser.parse_args())
    # todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok?
    run(**d)


if __name__ == '__main__':
run()
cli()

0 comments on commit de81b23

Please sign in to comment.