Skip to content

Commit

Permalink
Merge pull request #686 from Discngine/split_line_enhancement
Browse files Browse the repository at this point in the history
Update _split_one_line and remove whitespace parameter
  • Loading branch information
padix-key authored Nov 18, 2024
2 parents 8fd73bf + c465aa5 commit 77cf0f6
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 54 deletions.
80 changes: 28 additions & 52 deletions src/biotite/structure/io/pdbx/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
__all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]

import itertools
import re
from collections.abc import MutableMapping, Sequence
import numpy as np
from biotite.file import (
Expand Down Expand Up @@ -357,7 +356,7 @@ def supercomponent_class():
return CIFBlock

@staticmethod
def deserialize(text, expect_whitespace=True):
def deserialize(text):
lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]

if _is_loop_start(lines[0]):
Expand All @@ -372,7 +371,7 @@ def deserialize(text, expect_whitespace=True):

lines = _to_single(lines)
if is_looped:
category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
category_dict = CIFCategory._deserialize_looped(lines)
else:
category_dict = CIFCategory._deserialize_single(lines)
return CIFCategory(category_dict, category_name)
Expand Down Expand Up @@ -445,15 +444,15 @@ def _deserialize_single(lines):
line_i = 0
while line_i < len(lines):
line = lines[line_i]
parts = _split_one_line(line)
parts = list(_split_one_line(line))
if len(parts) == 2:
# Standard case -> name and value in one line
name_part, value_part = parts
line_i += 1
elif len(parts) == 1:
# Value is a multiline value on the next line
name_part = parts[0]
parts = _split_one_line(lines[line_i + 1])
parts = list(_split_one_line(lines[line_i + 1]))
if len(parts) == 1:
value_part = parts[0]
else:
Expand All @@ -467,7 +466,7 @@ def _deserialize_single(lines):
return category_dict

@staticmethod
def _deserialize_looped(lines, expect_whitespace):
def _deserialize_looped(lines):
"""
Process a category where each field has multiple values
(category is a table).
Expand All @@ -490,20 +489,7 @@ def _deserialize_looped(lines, expect_whitespace):
# row-line-alignment at all and simply cycle through columns
column_indices = itertools.cycle(range(len(column_names)))
for data_line in data_lines:
# If whitespace is expected in quote protected values,
# use regex-based _split_one_line() to split
# Otherwise use the much faster whitespace split
# and quote removal if applicable.
if expect_whitespace:
values = _split_one_line(data_line)
else:
values = data_line.split()
for k in range(len(values)):
# Remove quotes
if (values[k][0] == '"' and values[k][-1] == '"') or (
values[k][0] == "'" and values[k][-1] == "'"
):
values[k] = values[k][1:-1]
values = _split_one_line(data_line)
for val in values:
column_index = next(column_indices)
column_name = column_names[column_index]
Expand Down Expand Up @@ -685,15 +671,7 @@ def __getitem__(self, key):
# Element is stored in serialized form
# -> must be deserialized first
try:
# Special optimization for "atom_site":
# Even if the values are quote protected,
# no whitespace is expected in escaped values
# Therefore slow regex-based _split_one_line() call is not necessary
if key == "atom_site":
expect_whitespace = False
else:
expect_whitespace = True
category = CIFCategory.deserialize(category, expect_whitespace)
category = CIFCategory.deserialize(category)
except Exception:
raise DeserializationError(f"Failed to deserialize category '{key}'")
# Update with deserialized object
Expand Down Expand Up @@ -1062,29 +1040,27 @@ def _split_one_line(line):
"""
# Special case of multiline value, where the line starts with ';'
if line[0] == ";":
return [line[1:]]

# Define the patterns for different types of fields
single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
unquoted_pattern = r"([^\s]+)"

# Combine the patterns using alternation
combined_pattern = (
f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
)

# Find all matches
matches = re.findall(combined_pattern, line)

# Extract non-empty groups from the matches
fields = []
for match in matches:
field = next(group for group in match if group)
if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
field = field[1:-1]
fields.append(field)
return fields
yield line[1:]
else:
# Loop over the line
while line:
# Strip leading whitespace(s)
stripped_line = line.lstrip()
# Split the line on whitespace
word, _, line = stripped_line.partition(" ")
# Handle the case where the word starts with a quote
if word.startswith(("'", '"')):
# Set the separator to the quote found
separator = word[0]
# Handle the case of a quoted word without space
if word.endswith(separator) and len(word) > 1:
# Yield the word without the opening and closing quotes
yield word[1:-1]
continue
# Split the remainder of the line (after the opening quote) at the closing quote
word, _, line = stripped_line[1:].partition(separator)

yield word


def _arrayfy(data):
Expand Down
2 changes: 1 addition & 1 deletion tests/database/test_rcsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_fetch(format, as_file_like):
if format == "pdb":
file = pdb.PDBFile.read(file_path_or_obj)
pdb.get_structure(file)
elif format == "pdbx":
elif format == "cif":
file = pdbx.CIFFile.read(file_path_or_obj)
pdbx.get_structure(file)
elif format == "bcif":
Expand Down
2 changes: 1 addition & 1 deletion tests/structure/io/test_pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_split_one_line(cif_line, expected_fields):
"""
Test whether values that have an embedded quote are properly escaped.
"""
assert pdbx.cif._split_one_line(cif_line) == expected_fields
assert list(pdbx.cif._split_one_line(cif_line)) == expected_fields


@pytest.mark.parametrize(
Expand Down

0 comments on commit 77cf0f6

Please sign in to comment.