Skip to content

Commit

Permalink
Implement Stable ID Generation for Cell Sets (#93)
Browse files Browse the repository at this point in the history
* Add sorting by key columns in report generation for consistent DataFrame output

* Refactor entity generation order and replace UUID v4 with UUID v5

* Update version to 0.2.3

* Add new assert statement for stable ID generation in test_generate_rdf_graph_with_merge
  • Loading branch information
ubyndr authored Dec 10, 2024
1 parent b3cf832 commit 919ac99
Show file tree
Hide file tree
Showing 5 changed files with 400 additions and 237 deletions.
10 changes: 7 additions & 3 deletions pandasaurus_cxg/anndata_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,12 @@ def co_annotation_report(
"""
# Call the core method to generate the full DataFrame
full_df = self._generate_co_annotation_dataframe(disease, enrich)

# Return only the first 5 columns
return full_df.iloc[:, :5]

def _generate_co_annotation_dataframe(self, disease: Optional[str] = None, enrich: bool = False):
def _generate_co_annotation_dataframe(
self, disease: Optional[str] = None, enrich: bool = False
):
"""
Core method to generate a full co-annotation dataframe.
Expand Down Expand Up @@ -171,7 +172,7 @@ def _generate_co_annotation_dataframe(self, disease: Optional[str] = None, enric
for record in temp_result
]
# unique_result = AnndataAnalyzer._remove_duplicates(result)
self.report_df = pd.DataFrame(
report_df = pd.DataFrame(
[
inner_list[:2]
+ inner_list[5:6]
Expand All @@ -190,6 +191,9 @@ def _generate_co_annotation_dataframe(self, disease: Optional[str] = None, enric
"field_name2_cell_count",
],
)
self.report_df = report_df.sort_values(
["field_name1", "value1", "predicate", "field_name2", "value2"]
).reset_index(drop=True)
return self.report_df

def enriched_co_annotation_report(self, disease: Optional[str] = None):
Expand Down
72 changes: 39 additions & 33 deletions pandasaurus_cxg/graph_generator/graph_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,42 @@ def generate_rdf_graph(self, merge: bool = False):
"""
if len(self.graph) != 0:
return
# preprocess

# generate dataset entity and has_source property
citation_dict = {}
uns = self.ea.enricher_manager.anndata.uns
if citation_field_name in uns.keys():
citation_dict = parse_citation_field_into_dict(uns[citation_field_name])
cxg_versioned_dataset_id = (
citation_dict.get("download_link").split("/")[-1].split(".")[0]
)
dataset_class = URIRef(get_cxg_dataset_url(cxg_versioned_dataset_id))
else:
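# if citation_field_name doesn't exist, fall back to a random UUID (v4) as cxg_versioned_dataset_id.
# NOTE: this value is later used as the uuid5 namespace for cell set IDs, so in this fallback case the generated IDs are not stable across runs.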
cxg_versioned_dataset_id = str(uuid.uuid4())
dataset_class = URIRef(self.ns[cxg_versioned_dataset_id])
self.graph.add((dataset_class, RDF.type, URIRef(DATASET.get("iri"))))
self.graph.add((dataset_class, RDFS.label, Literal(DATASET.get("label"))))
for key, value in uns.items():
if not isinstance(value, str):
continue
if key == citation_field_name:
for citation_key, citation_value in citation_dict.items():
self.graph.add(
(
dataset_class,
URIRef(self.ns[remove_special_characters(citation_key)]),
Literal(citation_value),
)
)

self.graph.add(
(dataset_class, URIRef(self.ns[remove_special_characters(key)]), Literal(value))
)
has_source = URIRef(HAS_SOURCE["iri"])
self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))

# preprocess for cell clusters
column_group = ["field_name1", "value1"]
df = self.df.sort_values(by=column_group).reset_index(drop=True)
grouped_df = df.groupby(column_group)
Expand Down Expand Up @@ -124,38 +159,9 @@ def generate_rdf_graph(self, merge: bool = False):
) else temp_dict.update({key: value})

if temp_dict not in grouped_dict_uuid.values():
grouped_dict_uuid[str(uuid.uuid4())] = temp_dict

# generate dataset entity and has_source property
uns = self.ea.enricher_manager.anndata.uns
citation_dict = {}
if citation_field_name in uns.keys():
citation_dict = parse_citation_field_into_dict(uns[citation_field_name])
dataset_class = URIRef(
get_cxg_dataset_url(citation_dict.get("download_link").split("/")[-1].split(".")[0])
)
else:
dataset_class = URIRef(self.ns[str(uuid.uuid4())])
self.graph.add((dataset_class, RDF.type, URIRef(DATASET.get("iri"))))
self.graph.add((dataset_class, RDFS.label, Literal(DATASET.get("label"))))
for key, value in uns.items():
if not isinstance(value, str):
continue
if key == citation_field_name:
for citation_key, citation_value in citation_dict.items():
self.graph.add(
(
dataset_class,
URIRef(self.ns[remove_special_characters(citation_key)]),
Literal(citation_value),
)
)

self.graph.add(
(dataset_class, URIRef(self.ns[remove_special_characters(key)]), Literal(value))
)
has_source = URIRef(HAS_SOURCE["iri"])
self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"])))
grouped_dict_uuid[
str(uuid.uuid5(uuid.UUID(cxg_versioned_dataset_id), str(temp_dict)))
] = temp_dict

# generate a resource for each free-text cell_type annotation and cell_type_ontology_term annotation
cell_set_class = URIRef(CLUSTER.get("iri"))
Expand Down
Loading

0 comments on commit 919ac99

Please sign in to comment.