From 39c382483ec35f496ebbcce85a8d2fde5f1db811 Mon Sep 17 00:00:00 2001 From: Ismail Ugur Bayindir Date: Mon, 20 May 2024 15:18:14 +0100 Subject: [PATCH] Doi patch for citation field in uns (#74) * Updated version to 0.1.13 * Added dataset object to rdf representation * Disabled supercluster_of removal to fix missing nodes in graph generation * Added add_metadata_nodes method * Rounded up percentages in add_metadata_nodes * Refactored add_metadata_nodes to utilize OWL reification for improved visualisation in neo4j. * Added parsing mechanism for citation field in uns --- .../graph_generator/graph_generator.py | 14 +++++++- .../graph_generator/graph_generator_utils.py | 32 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/pandasaurus_cxg/graph_generator/graph_generator.py b/pandasaurus_cxg/graph_generator/graph_generator.py index 5bf1baf..7601fb0 100644 --- a/pandasaurus_cxg/graph_generator/graph_generator.py +++ b/pandasaurus_cxg/graph_generator/graph_generator.py @@ -5,7 +5,6 @@ import matplotlib.pyplot as plt import networkx as nx -import pandas as pd from pandasaurus.graph.graph_generator import GraphGenerator as graphgen from rdflib import OWL, RDF, RDFS, BNode, Graph, Literal, Namespace, URIRef from rdflib.plugins.sparql import prepareQuery @@ -19,9 +18,11 @@ add_edge, add_node, add_outgoing_edges_to_subgraph, + citation_field_name, colour_mapping, find_and_rotate_center_layout, generate_subgraph, + parse_citation_field_into_dict, remove_special_characters, select_node_with_property, ) @@ -129,6 +130,17 @@ def generate_rdf_graph(self): for key, value in uns.items(): if not isinstance(value, str): continue + if key == citation_field_name: + citation_dict = parse_citation_field_into_dict(value) + for citation_key, citation_value in citation_dict.items(): + self.graph.add( + ( + dataset_class, + URIRef(self.ns[citation_key]), + Literal(citation_value), + ) + ) + self.graph.add((dataset_class, URIRef(self.ns[key]), Literal(value))) has_source = URIRef(HAS_SOURCE["iri"]) self.graph.add((has_source, RDFS.label, Literal(HAS_SOURCE["label"]))) diff --git a/pandasaurus_cxg/graph_generator/graph_generator_utils.py b/pandasaurus_cxg/graph_generator/graph_generator_utils.py index 845f8f7..511a1ad 100644 --- a/pandasaurus_cxg/graph_generator/graph_generator_utils.py +++ b/pandasaurus_cxg/graph_generator/graph_generator_utils.py @@ -1,4 +1,5 @@ import re +from typing import Dict import networkx as nx from rdflib import OWL, RDF, RDFS, BNode, Graph, Literal, Namespace, URIRef @@ -14,6 +15,8 @@ "http://purl.obolibrary.org/obo/PCL_0010001": "cyan", } +citation_field_name = "citation" + def add_edge(nx_graph: nx.Graph, subject, predicate, obj): edge_data = { @@ -126,3 +129,32 @@ def select_node_with_property(graph: Graph, _property: str, value: str): def remove_special_characters(input_string: str) -> str: return re.sub(r"[^a-zA-Z0-9_]", "", input_string.replace(" ", "_")) + + +def parse_citation_field_into_dict(value: str) -> Dict[str, str]: + """ + Parses a citation string into a dictionary by extracting key citation fields. + + Args: + value: The string containing citation fields and values. + + Returns: + A dictionary with keys such as 'Publication', 'Dataset Version', and 'Collection', + and corresponding values extracted from the input string. + """ + # Split the input string on the key terms + parts = value.split(" ") + keys = ["Publication:", "Version:", "Collection:"] + key_indices = [parts.index(key) for key in keys if key in parts] + # Break down into key-value pairs + key_value_pairs = {} + for i, index in enumerate(key_indices): + current_value = " ".join(parts[index + 1 : index + 2]) + key_value_pairs.update( + { + "download_link" + if parts[index][:-1].lower() == "version" + else parts[index][:-1].lower(): current_value + } + ) + return key_value_pairs