Merge pull request #686 from Discngine/split_line_enhancement

Update _split_one_line and remove whitespace parameter
biotite-dev · Nov 18, 2024 · 77cf0f6 · 77cf0f6
2 parents 8fd73bf + c465aa5
commit 77cf0f6
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 54 deletions.
diff --git a/src/biotite/structure/io/pdbx/cif.py b/src/biotite/structure/io/pdbx/cif.py
@@ -7,7 +7,6 @@
 __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
 
 import itertools
-import re
 from collections.abc import MutableMapping, Sequence
 import numpy as np
 from biotite.file import (
@@ -357,7 +356,7 @@ def supercomponent_class():
         return CIFBlock
 
     @staticmethod
-    def deserialize(text, expect_whitespace=True):
+    def deserialize(text):
         lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]
 
         if _is_loop_start(lines[0]):
@@ -372,7 +371,7 @@ def deserialize(text, expect_whitespace=True):
 
         lines = _to_single(lines)
         if is_looped:
-            category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
+            category_dict = CIFCategory._deserialize_looped(lines)
         else:
             category_dict = CIFCategory._deserialize_single(lines)
         return CIFCategory(category_dict, category_name)
@@ -445,15 +444,15 @@ def _deserialize_single(lines):
         line_i = 0
         while line_i < len(lines):
             line = lines[line_i]
-            parts = _split_one_line(line)
+            parts = list(_split_one_line(line))
             if len(parts) == 2:
                 # Standard case -> name and value in one line
                 name_part, value_part = parts
                 line_i += 1
             elif len(parts) == 1:
                 # Value is a multiline value on the next line
                 name_part = parts[0]
-                parts = _split_one_line(lines[line_i + 1])
+                parts = list(_split_one_line(lines[line_i + 1]))
                 if len(parts) == 1:
                     value_part = parts[0]
                 else:
@@ -467,7 +466,7 @@ def _deserialize_single(lines):
         return category_dict
 
     @staticmethod
-    def _deserialize_looped(lines, expect_whitespace):
+    def _deserialize_looped(lines):
         """
         Process a category where each field has multiple values
         (category is a table).
@@ -490,20 +489,7 @@ def _deserialize_looped(lines, expect_whitespace):
         # row-line-alignment at all and simply cycle through columns
         column_indices = itertools.cycle(range(len(column_names)))
         for data_line in data_lines:
-            # If whitespace is expected in quote protected values,
-            # use regex-based _split_one_line() to split
-            # Otherwise use much more faster whitespace split
-            # and quote removal if applicable.
-            if expect_whitespace:
-                values = _split_one_line(data_line)
-            else:
-                values = data_line.split()
-                for k in range(len(values)):
-                    # Remove quotes
-                    if (values[k][0] == '"' and values[k][-1] == '"') or (
-                        values[k][0] == "'" and values[k][-1] == "'"
-                    ):
-                        values[k] = values[k][1:-1]
+            values = _split_one_line(data_line)
             for val in values:
                 column_index = next(column_indices)
                 column_name = column_names[column_index]
@@ -685,15 +671,7 @@ def __getitem__(self, key):
             # Element is stored in serialized form
             # -> must be deserialized first
             try:
-                # Special optimization for "atom_site":
-                # Even if the values are quote protected,
-                # no whitespace is expected in escaped values
-                # Therefore slow regex-based _split_one_line() call is not necessary
-                if key == "atom_site":
-                    expect_whitespace = False
-                else:
-                    expect_whitespace = True
-                category = CIFCategory.deserialize(category, expect_whitespace)
+                category = CIFCategory.deserialize(category)
             except Exception:
                 raise DeserializationError(f"Failed to deserialize category '{key}'")
             # Update with deserialized object
@@ -1062,29 +1040,27 @@ def _split_one_line(line):
     """
     # Special case of multiline value, where the line starts with ';'
     if line[0] == ";":
-        return [line[1:]]
-
-    # Define the patterns for different types of fields
-    single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
-    double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
-    unquoted_pattern = r"([^\s]+)"
-
-    # Combine the patterns using alternation
-    combined_pattern = (
-        f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
-    )
-
-    # Find all matches
-    matches = re.findall(combined_pattern, line)
-
-    # Extract non-empty groups from the matches
-    fields = []
-    for match in matches:
-        field = next(group for group in match if group)
-        if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
-            field = field[1:-1]
-        fields.append(field)
-    return fields
+        yield line[1:]
+    else:
+        # Loop over the line
+        while line:
+            # Strip leading whitespace(s)
+            stripped_line = line.lstrip()
+            # Split the line on whitespace
+            word, _, line = stripped_line.partition(" ")
+            # Handle the case where the word start with a quote
+            if word.startswith(("'", '"')):
+                # Set the separator to the quote found
+                separator = word[0]
+                # Handle the case of a quoted word without space
+                if word.endswith(separator) and len(word) > 1:
+                    # Yield the word without the opening and closing quotes
+                    yield word[1:-1]
+                    continue
+                # split the word on the separator
+                word, _, line = stripped_line[1:].partition(separator)
+
+            yield word
 
 
 def _arrayfy(data):

diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
@@ -31,7 +31,7 @@ def test_fetch(format, as_file_like):
     if format == "pdb":
         file = pdb.PDBFile.read(file_path_or_obj)
         pdb.get_structure(file)
-    elif format == "pdbx":
+    elif format == "cif":
         file = pdbx.CIFFile.read(file_path_or_obj)
         pdbx.get_structure(file)
     elif format == "bcif":

diff --git a/tests/structure/io/test_pdbx.py b/tests/structure/io/test_pdbx.py
@@ -85,7 +85,7 @@ def test_split_one_line(cif_line, expected_fields):
     """
     Test whether values that have an embedded quote are properly escaped.
     """
-    assert pdbx.cif._split_one_line(cif_line) == expected_fields
+    assert list(pdbx.cif._split_one_line(cif_line)) == expected_fields
 
 
 @pytest.mark.parametrize(