-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Add: makefile: Missing make goals for dependencies for slurp goal. - Add: Python: CLI - Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params. - Update: Python: Completed script, inspired by initial pseudo code. (WIP)
- Loading branch information
Showing
2 changed files
with
150 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,141 @@ | ||
"""Migration pipeline | ||
"""Slurp migration pipeline | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
TODOs: | ||
- add CLI: look to makefile for what to include | ||
TODO's: | ||
- | ||
""" | ||
import oakliblib | ||
import pandas | ||
import os.path | ||
from argparse import ArgumentParser | ||
from typing import Dict, List | ||
|
||
import pandas as pd | ||
import oaklib | ||
from oaklib.implementations import ProntoImplementation | ||
from oaklib.resource import OntologyResource | ||
|
||
|
||
# todo: IDs should be int or str? prolly str
def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str:
    """Starting from `min_id`, count up and check until finding the next ID.

    :param min_id: Lowest candidate ID. May be a bare number (e.g. `'0850000'`) or a number behind a prefix
      (e.g. `'MONDO:0850000'`). The prefix and the zero-padded width are preserved in the returned value.
    :param mondo_termlist_df: Table of terms that already exist in Mondo.
      NOTE(review): the column layout of this table is not visible from here, so every cell is scanned for
      `MONDO:<digits>` / `MONDO_<digits>` (CURIE or IRI form) rather than reading one named column -- confirm.
    :return: The first ID at or above `min_id` whose number was not found in `mondo_termlist_df`, formatted like
      `min_id`.
    :raises ValueError: If `min_id` does not end with a number.
    """
    import re  # local import: only this function needs it

    id_parts = re.fullmatch(r'(.*?)(\d+)', str(min_id).strip())
    if id_parts is None:
        raise ValueError(f'`min_id` must end with a number, got: {min_id!r}')
    prefix, digits = id_parts.groups()

    # Collect the numeric part of every Mondo ID mentioned anywhere in the table.
    mondo_id_pattern = re.compile(r'MONDO[:_](\d+)')
    used_ids = set()
    for _, column in mondo_termlist_df.items():
        for cell in column.dropna():
            used_ids.update(int(number) for number in mondo_id_pattern.findall(str(cell)))

    candidate = int(digits)
    while candidate in used_ids:
        candidate += 1
    # Re-apply the caller's formatting so the result is interchangeable with `min_id`.
    return f'{prefix}{candidate:0{len(digits)}d}'
|
||
|
||
def _get_direct_parents(curie: str, ontology: ProntoImplementation) -> List[str]:
    """Get CURIEs of the direct parents of a class

    Only asserted `rdfs:subClassOf` relationships are considered; targets of other relationship types are not
    parents and are ignored.

    todo: `ontology` typing: should probably be typed agnostic of the implementation. But it inherits from so
     much stuff, I'm not sure which if any of these to use. I was expecting `BasicOntologyInterface`, but it doesn't
     inherit from that: RelationGraphInterface, OboGraphInterface, ValidatorInterface, SearchInterface,
     SubsetterInterface, MappingProviderInterface, PatcherInterface, SemanticSimilarityInterface, MetadataInterface,
     DifferInterface, ABC

    :param curie: Identifier of the class whose parents are wanted.
    :param ontology: Loaded ontology to query.
    :return: Parent identifiers, in the order reported by the ontology, without duplicates.
    """
    is_a = 'rdfs:subClassOf'
    parents: List[str] = []
    # `relationships()` yields (subject, predicate, object) triples; it is not a mapping, so `.items()` cannot be
    # used on it. The predicate is re-checked here in case an implementation ignores the `predicates` filter.
    for _subject, predicate, parent in ontology.relationships([curie], predicates=[is_a]):
        if predicate == is_a and parent not in parents:
            parents.append(parent)
    return parents
|
||
#Inputs: | ||
source_ontology = '' #e.g. omim | ||
sssom_map = '' # e.g. mondo.sssom.tsv | ||
min_id = '' | ||
termlist_mondo = '' | ||
|
||
def _get_all_term_curies(ontology: ProntoImplementation, is_omim=False) -> List[str]:
    """Collect the identifiers of every entity in `ontology`.

    todo: Will we need prefix_map from our config `.yml`s? Or can OAK handle automatically?
     - reasons we might need: (i) filter correct terms, (ii) convert URI to CURIE if get URI
    todo: convert to CURIE using charlie's new lib

    :param ontology: Loaded ontology whose `entities()` are listed.
    :param is_omim: Temporary switch. When true, only identifiers starting with one of the OMIM CURIE prefixes
      or OMIM URI prefixes below are kept.
    :return: Identifiers exactly as `ontology.entities()` reports them (no URI -> CURIE conversion is done yet).
    """
    # todo: temp until I figure out if (a) we need `prefix_map`, or (b) OAK can somehow handle automatically
    omim_prefixes = (
        'OMIM:',
        'OMIMPS:',
        'https://omim.org/entry/',
        'https://omim.org/phenotypicSeries/PS',
    )
    curies = list(ontology.entities())
    if not is_omim:
        return curies
    return [curie for curie in curies if curie.startswith(omim_prefixes)]
|
||
# todo: temp links:
#  https://incatools.github.io/ontology-access-kit/
#  https://incatools.github.io/ontology-access-kit/intro/tutorial02.html
def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame:
    """Run slurp pipeline for given ontology

    A source term is "slurpable" when (a) it is not itself an `object_id` in the SSSOM mappings, (b) every one of
    its direct parents is an `object_id` in the mappings, and (c) at least one parent has a `skos:exactMatch` or
    `skos:narrowMatch` mapping.

    :param ontology_path: Path to the source ontology file. If its file name contains `omim`, terms are filtered
      to OMIM identifiers (temporary behaviour).
    :param sssom_map_path: Path to a tab-separated SSSOM file; `#` lines are skipped. The columns `subject_id`,
      `predicate_id` and `object_id` are read.
    :param min_id: Passed through to `determine_next_available_mondo_id`.
    :param mondo_terms_path: Path to a tab-separated list of Mondo terms; `#` lines are skipped.
    :param outpath: Where the resulting table is written, tab-separated.
    :return: One row per slurpable term with columns `mondo_id`, `xref`, `label`, `definition`.
    """
    # Read source files
    # todo: `ontology`: If trouble, can try `SparqlImplementation`, but ~6 min to load and queries slow (cuz rdflib)
    ontology = ProntoImplementation(OntologyResource(slug=ontology_path, local=True))
    sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t')
    mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t')

    # Initialize variables
    # todo: is_omim is temporary
    source_onto_terms: List[str] = _get_all_term_curies(ontology, is_omim='omim' in os.path.basename(ontology_path))

    # Index the mappings once by `object_id`, so each lookup below is a dict access instead of a filter over the
    # whole frame. An object may carry several mappings, hence the list.
    mappings_by_object: Dict[str, List[tuple]] = {}
    for object_id, predicate_id, subject_id in zip(
            sssom_df['object_id'], sssom_df['predicate_id'], sssom_df['subject_id']):
        mappings_by_object.setdefault(object_id, []).append((predicate_id, subject_id))
    slurpable_predicates = {'skos:exactMatch', 'skos:narrowMatch'}

    # Get slurpable terms
    slurpable_terms = []
    for t in source_onto_terms:
        if t in mappings_by_object:
            continue  # already mapped: nothing to slurp
        migrate = True
        slurpable_parents: List[str] = []
        for parent in _get_direct_parents(t, ontology):
            if parent not in mappings_by_object:
                # An unmapped parent blocks migration of this term.
                migrate = False
                break
            # In other words, if the parent is mapped, and the mapping is either exact or narrower, OK to add.
            # Any other kind of mapping is fine too; we just continue looking at the other parents.
            slurpable_parents.extend(
                subject_id for predicate_id, subject_id in mappings_by_object[parent]
                if predicate_id in slurpable_predicates)
        if not (migrate and slurpable_parents):
            continue
        # Starting from min_id, then counting up and checking if it does not already exist.
        # NOTE(review): this is called with the same arguments for every term; unless the helper accounts for IDs
        # handed out earlier in this loop, several rows may receive the same `mondo_id` -- confirm.
        next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df)
        slurpable_terms.append({
            'mondo_id': next_mondo_id,
            'xref': t,
            'label': ontology.label(t),
            'definition': ontology.definition(t),
        })

    result = pd.DataFrame(slurpable_terms)
    result.to_csv(outpath, sep="\t")
    return result
def cli():
    """Command line interface.

    Registers the five required options, parses the process arguments and forwards them to `run` as keyword
    arguments (argparse turns `--ontology-path` into `ontology_path`, and so on).
    """
    parser = ArgumentParser(
        description='Slurp pipeline: Integrate new terms from other ontologies into Mondo.')
    # (short flag, long flag, help text); every option is required.
    options = (
        ('-o', '--ontology-path',
         'Path to ontology file, e.g. an `.owl` file.'),
        ('-m', '--sssom-map-path',
         'Path to file containing all known Mondo mappings, in SSSOM format.'),
        ('-i', '--min-id',
         'The ID from which we want to begin searching from in order to locate any currently unslurped terms.'),
        ('-t', '--mondo-terms-path',
         'Path to a file that contains a list of all Mondo terms.'),
        ('-O', '--outpath',
         'Path to save the output slurp `.tsv` file, containing list of new terms to integrate into Mondo.'),
    )
    for short_flag, long_flag, help_text in options:
        parser.add_argument(short_flag, long_flag, required=True, help=help_text)
    kwargs: Dict = vars(parser.parse_args())
    # todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok?
    run(**kwargs)
|
||
|
||
# Script entry point: all arguments come from the command line, so `run` is only reached through `cli`.
if __name__ == '__main__':
    cli()