Cl annotation patch (#66)

* Refactored adding cell_type nodes and consists_of relations
* Refactored seed_list to seed_dict
* Updated test cases
* Updated version to 0.1.12
* Updated poetry.lock
INCATools · Apr 16, 2024 · 0a4765c · 0a4765c
1 parent ef2463e
commit 0a4765c
Show file tree

Hide file tree

Showing 7 changed files with 678 additions and 1,021 deletions.
diff --git a/pandasaurus_cxg/anndata_analyzer.py b/pandasaurus_cxg/anndata_analyzer.py
@@ -175,7 +175,9 @@ def _enrich_co_annotation(enricher: AnndataEnricher):
         enriched_df = enricher.enricher.enriched_df
         if enriched_df.empty:
             return enriched_df
-        return enriched_df[enriched_df["o"].isin(enricher.seed_list)][["s_label", "o_label"]]
+        return enriched_df[enriched_df["o"].isin(list(enricher.seed_dict.keys()))][
+            ["s_label", "o_label"]
+        ]
 
     def _filter_data_and_drop_duplicates(self, field_name_1, field_name_2, disease):
         # Filter the data based on the disease condition

diff --git a/pandasaurus_cxg/anndata_enricher.py b/pandasaurus_cxg/anndata_enricher.py
@@ -46,8 +46,12 @@ def __init__(
             ontology_list_for_slims = ["Cell Ontology"]
         # TODO Do we need to keep whole anndata? Would it be enough to keep the obs only?
         self.anndata = anndata
-        self.seed_list = self.anndata.obs[cell_type_field].unique().tolist()
-        self.enricher = Query(self.seed_list)
+        self.seed_dict = dict(
+            self.anndata.obs.drop_duplicates(subset=[cell_type_field, "cell_type"])[
+                [cell_type_field, "cell_type"]
+            ].values
+        )
+        self.enricher = Query(list(self.seed_dict.keys()))
         try:
             unique_context = self.anndata.obs[
                 [context_field, context_field_label]
@@ -226,7 +230,7 @@ def set_enricher_property_list(self, property_list: List[str]):
         Args:
             property_list (List[str]): The list of properties to include in the enrichment analysis.
         """
-        self.enricher = Query(self.seed_list, property_list)
+        self.enricher = Query(list(self.seed_dict.keys()), property_list)
 
     def validate_slim_list(self, slim_list):
         """Check if any slim term in the given list is invalid.

diff --git a/pandasaurus_cxg/graph_generator/graph_generator.py b/pandasaurus_cxg/graph_generator/graph_generator.py
@@ -147,38 +147,11 @@ def generate_rdf_graph(self):
         # transitive reduction step
         self.graph = graphgen.apply_transitive_reduction(self.graph, [subcluster.toPython()])
 
-    def enrich_rdf_graph(self):
-        """
-        Enrich RDF graph with enriched DataFrame from AnndataEnricher
-
-        Returns:
-
-        """
-        if self.ea.enricher_manager.enricher.enriched_df.empty:
-            # TODO or we can just call simple_enrichment method
-            enrichment_methods = [i for i in dir(AnndataEnricher) if "_enrichment" in i]
-            enrichment_methods.sort()
-            raise MissingEnrichmentProcess(enrichment_methods)
-        cell_type_dict = (
-            pd.concat(
-                [
-                    self.ea.enricher_manager.enricher.enriched_df[["s", "s_label"]],
-                    self.ea.enricher_manager.enricher.enriched_df[["o", "o_label"]].rename(
-                        columns={"o": "s", "o_label": "s_label"}
-                    ),
-                ],
-                axis=0,
-                ignore_index=True,
-            )
-            .drop_duplicates()
-            .set_index("s")["s_label"]
-            .to_dict()
-        )
         # add cell_type nodes and consists_of relations
         cl_namespace = Namespace("http://purl.obolibrary.org/obo/CL_")
         consist_of = URIRef(CONSIST_OF.get("iri"))
         self.graph.add((consist_of, RDFS.label, Literal(CONSIST_OF.get("label"))))
-        for curie, label in cell_type_dict.items():
+        for curie, label in self.ea.enricher_manager.seed_dict.items():
             resource = cl_namespace[curie.split(":")[-1]]
             self.graph.add((resource, RDFS.label, Literal(label)))
             self.graph.add((resource, RDF.type, OWL.Class))
@@ -192,7 +165,19 @@ def enrich_rdf_graph(self):
                 # Add the restriction
                 self.graph.add((s, RDF.type, class_expression_bnode))
 
-        # add enrichment graph
+    def enrich_rdf_graph(self):
+        """
+        Add the AnndataEnricher's enrichment graph to self.graph; returns nothing.
+
+        Raises:
+            MissingEnrichmentProcess: If the enricher's enriched_df is empty.
+        """
+        if self.ea.enricher_manager.enricher.enriched_df.empty:
+            # TODO: alternatively, call the simple_enrichment method instead of raising
+            enrichment_methods = [i for i in dir(AnndataEnricher) if "_enrichment" in i]
+            enrichment_methods.sort()
+            raise MissingEnrichmentProcess(enrichment_methods)
+        # add the enrichment graph (subClassOf relations) to this graph
         self.graph += self.ea.enricher_manager.enricher.graph
 
     def save_rdf_graph(