diff --git a/benchmarks/structure/benchmark_superimpose.py b/benchmarks/structure/benchmark_superimpose.py new file mode 100644 index 000000000..a4376ed7e --- /dev/null +++ b/benchmarks/structure/benchmark_superimpose.py @@ -0,0 +1,28 @@ +from pathlib import Path +import pytest +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +@pytest.fixture +def atoms(): + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1gya.bcif") + return pdbx.get_structure(pdbx_file) + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "method", + [ + struc.superimpose, + struc.superimpose_without_outliers, + struc.superimpose_homologs, + struc.superimpose_structural_homologs, + ], +) +def benchmark_superimpose(method, atoms): + """ + Compute superimposition of two structures with the same number of atoms. + """ + method(atoms[0], atoms[1]) diff --git a/doc/apidoc.json b/doc/apidoc.json index 3fc5a98d7..281e9285c 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -255,8 +255,9 @@ ], "Superimpositions" : [ "superimpose", - "superimpose_homologs", "superimpose_without_outliers", + "superimpose_homologs", + "superimpose_structural_homologs", "AffineTransformation" ], "Filters" : [ @@ -319,7 +320,8 @@ "average", "rmsd", "rmspd", - "rmsf" + "rmsf", + "tm_score" ], "General analysis" : [ "sasa", diff --git a/src/biotite/sequence/align/alignment.py b/src/biotite/sequence/align/alignment.py index cbbbe1cdd..b13f42de8 100644 --- a/src/biotite/sequence/align/alignment.py +++ b/src/biotite/sequence/align/alignment.py @@ -19,6 +19,7 @@ "score", "find_terminal_gaps", "remove_terminal_gaps", + "remove_gaps", ] @@ -666,6 +667,28 @@ def remove_terminal_gaps(alignment): return alignment[start:stop] +def remove_gaps(alignment): + """ + Remove all gap columns from an alignment. + + Parameters + ---------- + alignment : Alignment + The alignment to be modified. + + Returns + ------- + truncated_alignment : Alignment + The alignment without gap columns. + + See Also + -------- + remove_terminal_gaps : Remove only terminal gap columns + """ + non_gap_mask = (alignment.trace != -1).all(axis=1) + return alignment[non_gap_mask] + + def _is_single_letter(alphabet): """ More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`: diff --git a/src/biotite/structure/__init__.py b/src/biotite/structure/__init__.py index c8ecec129..6a5e014d2 100644 --- a/src/biotite/structure/__init__.py +++ b/src/biotite/structure/__init__.py @@ -129,5 +129,6 @@ from .sequence import * from .sse import * from .superimpose import * +from .tm import * from .transform import * # util and segments are used internally diff --git a/src/biotite/structure/superimpose.py b/src/biotite/structure/superimpose.py index d06d0abdb..a0bd49b66 100755 --- a/src/biotite/structure/superimpose.py +++ b/src/biotite/structure/superimpose.py @@ -17,10 +17,13 @@ import numpy as np -from biotite.sequence.align import SubstitutionMatrix, align_optimal, get_codes +from biotite.sequence.align.alignment import get_codes, remove_gaps +from biotite.sequence.align.matrix import SubstitutionMatrix +from biotite.sequence.align.pairwise import align_optimal from biotite.sequence.alphabet import common_alphabet from biotite.sequence.seqtypes import ProteinSequence from biotite.structure.atoms import coord +from biotite.structure.chains import chain_iter from biotite.structure.filter import filter_amino_acids, filter_nucleotides from biotite.structure.geometry import centroid, distance from biotite.structure.sequence import to_sequence @@ -464,10 +467,16 @@ def superimpose_without_outliers( def superimpose_homologs( - fixed, mobile, substitution_matrix=None, gap_penalty=-10, min_anchors=3, **kwargs + fixed, + mobile, + substitution_matrix=None, + gap_penalty=-10, + min_anchors=3, + terminal_penalty=False, + **kwargs, ): r""" - Superimpose one protein or nucleotide chain onto another one, + Superimpose a protein or nucleotide structure onto another one, considering sequence differences and conformational outliers. The method finds corresponding residues by sequence alignment and @@ -480,11 +489,11 @@ def superimpose_homologs( ---------- fixed : AtomArray, shape(n,) or AtomArrayStack, shape(m,n) The fixed structure(s). - Must comprise a single chain. mobile : AtomArray, shape(n,) or AtomArrayStack, shape(m,n) - The structure(s) which is/are superimposed on the `fixed` - structure. - Must comprise a single chain. + The structure(s) which is/are superimposed on the `fixed` structure. + Must contain the same number of chains as `fixed` with corresponding chains + being in the same order. + The specific chain IDs can be different. substitution_matrix : str or SubstitutionMatrix, optional The (name of the) substitution matrix used for sequence alignment. @@ -504,6 +513,9 @@ def superimpose_homologs( `mobile` in this fallback case, an exception is raised. Furthermore, the outlier removal is stopped, if less than `min_anchors` anchors would be left. + terminal_penalty : bool, optional + If set to true, gap penalties are applied to terminal gaps in the sequence + alignment. **kwargs Additional parameters for :func:`superimpose_without_outliers()`. @@ -527,6 +539,7 @@ def superimpose_homologs( -------- superimpose : Superimposition without outlier removal superimpose_without_outliers : Internally used for outlier removal + superimpose_structural_homologs : Better suited for low sequence similarity Notes ----- @@ -540,7 +553,7 @@ def superimpose_homologs( or len(mobile_anchor_indices) < min_anchors ): raise ValueError( - "Structures have too few CA atoms for required number of anchors" + "Structures have too few backbone atoms for required number of anchors" ) anchor_indices = _find_matching_anchors( @@ -548,18 +561,19 @@ def superimpose_homologs( mobile[..., mobile_anchor_indices], substitution_matrix, gap_penalty, + terminal_penalty, ) if len(anchor_indices) < min_anchors: # Fallback: Match all backbone anchors if len(fixed_anchor_indices) != len(mobile_anchor_indices): raise ValueError( "Tried fallback due to low anchor number, " - "but number of CA atoms does not match" + "but number of backbone atoms does not match" ) fixed_anchor_indices = fixed_anchor_indices mobile_anchor_indices = mobile_anchor_indices else: - # The anchor indices point to the CA atoms + # The anchor indices point to the backbone atoms # -> get the corresponding indices for the whole structure fixed_anchor_indices = fixed_anchor_indices[anchor_indices[:, 0]] mobile_anchor_indices = mobile_anchor_indices[anchor_indices[:, 1]] @@ -639,51 +653,52 @@ def _find_matching_anchors( mobile_anchors_atoms, substitution_matrix, gap_penalty, + terminal_penalty, ): """ Find corresponding residues using pairwise sequence alignment. """ - fixed_seq = _to_sequence(fixed_anchor_atoms) - mobile_seq = _to_sequence(mobile_anchors_atoms) - common_alph = common_alphabet([fixed_seq.alphabet, mobile_seq.alphabet]) - if common_alph is None: - raise ValueError("Cannot superimpose peptides with nucleic acids") - - if substitution_matrix is None: - if isinstance(fixed_seq, ProteinSequence): - substitution_matrix = SubstitutionMatrix.std_protein_matrix() - else: - substitution_matrix = SubstitutionMatrix.std_nucleotide_matrix() - elif isinstance(substitution_matrix, str): - substitution_matrix = SubstitutionMatrix( - common_alph, common_alph, substitution_matrix - ) - score_matrix = substitution_matrix.score_matrix() + anchor_list = [] + fixed_seq_offset = 0 + mobile_seq_offset = 0 + for fixed_chain, mobile_chain in zip( + chain_iter(fixed_anchor_atoms), chain_iter(mobile_anchors_atoms), strict=True + ): + # The input is a single chain -> expect a single sequence + fixed_seq = to_sequence(fixed_chain, allow_hetero=True)[0][0] + mobile_seq = to_sequence(mobile_chain, allow_hetero=True)[0][0] + + common_alph = common_alphabet([fixed_seq.alphabet, mobile_seq.alphabet]) + if common_alph is None: + raise ValueError("Cannot superimpose peptides with nucleic acids") + if substitution_matrix is None: + if isinstance(fixed_seq, ProteinSequence): + substitution_matrix = SubstitutionMatrix.std_protein_matrix() + else: + substitution_matrix = SubstitutionMatrix.std_nucleotide_matrix() + elif isinstance(substitution_matrix, str): + substitution_matrix = SubstitutionMatrix( + common_alph, common_alph, substitution_matrix + ) - alignment = align_optimal( - fixed_seq, - mobile_seq, - substitution_matrix, - gap_penalty, - terminal_penalty=False, - max_number=1, - )[0] - alignment_codes = get_codes(alignment) - anchor_mask = ( - # Anchors must be similar amino acids - (score_matrix[alignment_codes[0], alignment_codes[1]] > 0) + alignment = align_optimal( + fixed_seq, + mobile_seq, + substitution_matrix, + gap_penalty, + terminal_penalty=terminal_penalty, + max_number=1, + )[0] # Cannot anchor gaps - & (alignment_codes[0] != -1) - & (alignment_codes[1] != -1) - ) - anchors = alignment.trace[anchor_mask] - return anchors + alignment = remove_gaps(alignment) + ali_codes = get_codes(alignment) + score_matrix = substitution_matrix.score_matrix() + # Anchors must be similar amino acids + anchors = alignment.trace[score_matrix[ali_codes[0], ali_codes[1]] > 0] + anchors += fixed_seq_offset, mobile_seq_offset + fixed_seq_offset += len(fixed_seq) + mobile_seq_offset += len(mobile_seq) + anchor_list.append(anchors) -def _to_sequence(atoms): - sequences, _ = to_sequence(atoms, allow_hetero=True) - if len(sequences) == 0: - raise ValueError("Structure does not contain any amino acids or nucleotides") - if len(sequences) > 1: - raise ValueError("Structure contains multiple chains, but only one is allowed") - return sequences[0] + return np.concatenate(anchor_list, axis=0) diff --git a/src/biotite/structure/tm.py b/src/biotite/structure/tm.py new file mode 100644 index 000000000..508af70a3 --- /dev/null +++ b/src/biotite/structure/tm.py @@ -0,0 +1,579 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +This module provides functions for computing the TM-score between two structures and +for computing the superimposition to do so. +""" + +__name__ = "biotite.structure" +__author__ = "Patrick Kunzmann" +__all__ = [ + "tm_score", + "superimpose_structural_homologs", +] + +import itertools +import numpy as np +from biotite.sequence.align.alignment import get_codes, remove_gaps +from biotite.sequence.align.matrix import SubstitutionMatrix +from biotite.sequence.align.pairwise import align_optimal +from biotite.sequence.seqtypes import PurePositionalSequence +from biotite.structure.filter import filter_amino_acids +from biotite.structure.geometry import distance +from biotite.structure.residues import get_residue_count +from biotite.structure.superimpose import superimpose +from biotite.structure.util import coord_for_atom_name_per_residue + +# Minimum value for d0 +# This is not part of the explanation in the paper, but it is implemented in TM-align +_D0_MIN = 0.5 +# Gap open penalty for hybrid alignment +_HYBRID_PENALTY = -1 +# Gap open penalty for pure TM-based alignment +_TM_GAP_PENALTY = -0.6 +# Arbitrary scale factor to avoid rounding errors when converting scores to integer +_SCORE_SCALING = 100 + + +def tm_score( + reference, subject, reference_indices, subject_indices, reference_length="shorter" +): + """ + Compute the *TM*-score for the given protein structures. :footcite:`Zhang2004` + + Parameters + ---------- + reference, subject : AtomArray or ndarray, dtype=float + The protein structures to be compared. + The number of their atoms may differ from each other. + Alternatively, coordinates can be provided directly as + :class:`ndarray`. + reference_indices, subject_indices : ndarray, dtype=int, shape=(n,) + The indices of the atoms in the reference and subject, respectively, + that correspond to each other. + In consequence, the length of both arrays must be equal. + reference_length : int or {"shorter", "longer", "reference"} + The reference length used to normalize the TM-score. + If "shorter", the number of residues in the smaller structure is used. + If "longer", the number of residues in the larger structure is used. + If "reference", the number of residues in the reference structure is used. + The length can also be provided directly as an integer. + + Returns + ------- + tm_score : float + The *TM*-score for the given structure. + + See also + -------- + superimpose_structural_homologs : + Aims to minimize the *TM*-score between two structures. + It also returns the corresponding atom indices that can be passed to + :func:`tm_score()`. + + Notes + ----- + This functions takes the coordinates as they are. + It is recommended to use superimpose them using + :func:`superimpose_structural_homologs()` before, as that function aims to find a + superimposition that minimizes the *TM*-score. + + References + ---------- + + .. footbibliography:: + + Examples + -------- + + >>> reference = atom_array_stack[0] + >>> subject = atom_array_stack[1] + >>> superimposed, _, ref_indices, sub_indices = superimpose_structural_homologs( + ... reference, subject, max_iterations=1 + ... ) + >>> print(tm_score(reference, superimposed, ref_indices, sub_indices)) + 0.66... + """ + if not np.all(filter_amino_acids(reference)): + raise ValueError("Reference structure must be peptide only") + if not np.all(filter_amino_acids(subject)): + raise ValueError("Subject structure must be peptide only") + ref_length = _get_reference_length( + reference_length, get_residue_count(reference), get_residue_count(subject) + ) + distances = distance(reference[reference_indices], subject[subject_indices]) + return np.sum(_tm_score(distances, ref_length)).item() / ref_length + + +def superimpose_structural_homologs( + fixed, + mobile, + structural_alphabet="3di", + substitution_matrix=None, + max_iterations=float("inf"), + reference_length="shorter", +): + """ + Superimpose two remotely homologous protein structures. + + This method relies on structural similarity between the two given structures, + inspired by the *TM-align algorithm*. :footcite:`Zhang2005`. + Thus, this method is better suited for structurally homologous pairs in the + *twilight zone*, i.e. with low amino acid sequence similarity. + + Parameters + ---------- + fixed : AtomArray, shape(n,) + The fixed structure. + Must contain only peptide chains. + mobile : AtomArray, shape(n,) + The structure which is superimposed on the `fixed` structure. + Must contain only peptide chains. + Must contain the same number of chains as `fixed`. + structural_alphabet : {"3di", "pb"}, optional + The structural alphabet to use for finding corresponding residues using sequence + alignment. + Either *3Di* or *Protein Blocks*. + substitution_matrix : SubstitutionMatrix, optional + The substitution matrix to use for finding corresponding residues using sequence + alignment. + max_iterations : int, optional + The maximum number of iterations to perform in the last step. + reference_length : int or {"shorter", "longer", "reference"} + The reference length used to normalize the TM-score and to compute :math:`d_0`. + If "shorter", the number of residues in the smaller structure is used. + If "longer", the number of residues in the larger structure is used. + If "reference", the number of residues in the fixed structure is used. + The length can also be provided directly as an integer. + + Returns + ------- + fitted : AtomArray or AtomArrayStack + A copy of the `mobile` structure, superimposed on the fixed structure. + transform : AffineTransformation + This object contains the affine transformation(s) that were + applied on `mobile`. + :meth:`AffineTransformation.apply()` can be used to transform + another AtomArray in the same way. + fixed_indices, mobile_indices : ndarray, shape(k,), dtype=int + The indices of the corresponding ``CA`` atoms in the fixed and mobile structure, + respectively. + These atoms were used for the superimposition, if their pairwise distance is + below the :math:`d_0` threshold :footcite:`Zhang2004`. + + See also + -------- + superimpose_homologs : Analogous functionality for structures with high sequence similarity. + + Notes + ----- + The challenge of aligning two structures with different number of residues is + finding the corresponding residues between them. + This algorithm inspired by *TM-align* :footcite:`Zhang2005` uses a 3 step heuristic: + + 1. Find corresponding residues using a structural alphabet alignment and superimpose + the chains based on them. + 2. Refine the corresponding residues using a sequence alignment based on a hybrid + positional substitution matrix: + The scores are a 50/50 combination of the structural alphabet substitution score + and the distance-based TM-score between two residues. + The superimposition is updated based on the new corresponding residues. + 3. Refine the corresponding residues using a sequence alignment with a pure + TM-score based positional substitution matrix. + Update the superimposition based on the new corresponding residues. + Repeat this step until the correspondences are stable. + + References + ---------- + + .. footbibliography:: + + Examples + -------- + + >>> fixed = atom_array_stack[0] + >>> mobile = atom_array_stack[1] + >>> superimposed, _, fix_indices, mob_indices = superimpose_structural_homologs( + ... fixed, mobile, max_iterations=1 + ... ) + >>> print(tm_score(fixed, superimposed, fix_indices, mob_indices)) + 0.66... + >>> print(rmsd(fixed[fix_indices], superimposed[mob_indices])) + 0.82... + """ + # Avoid circular imports + from biotite.structure.alphabet.i3d import to_3di + from biotite.structure.alphabet.pb import to_protein_blocks + + match structural_alphabet.lower(): + case "3di": + conversion_function = to_3di + if substitution_matrix is None: + substitution_matrix = SubstitutionMatrix.std_3di_matrix() + case "pb": + conversion_function = to_protein_blocks + if substitution_matrix is None: + substitution_matrix = SubstitutionMatrix.std_protein_blocks_matrix() + case _: + raise ValueError( + f"Unsupported structural alphabet: '{structural_alphabet}'" + ) + + # Concatenate the structural sequences for simplicity + # In the the sequence alignment, this will make barely a difference compared + # to separate alignments, as there is no gap extension penalty + fixed_seq = _concatenate_sequences(conversion_function(fixed)[0]) + mobile_seq = _concatenate_sequences(conversion_function(mobile)[0]) + fixed_ca_coord = coord_for_atom_name_per_residue(fixed, ["CA"])[0] + mobile_ca_coord = coord_for_atom_name_per_residue(mobile, ["CA"])[0] + # NaN values (i.e. residues without CA atom) would let the superimposition fail + fixed_not_nan_mask = ~np.isnan(fixed_ca_coord).any(axis=-1) + mobile_not_nan_mask = ~np.isnan(mobile_ca_coord).any(axis=-1) + fixed_seq = fixed_seq[fixed_not_nan_mask] + fixed_ca_coord = fixed_ca_coord[fixed_not_nan_mask] + mobile_seq = mobile_seq[mobile_not_nan_mask] + mobile_ca_coord = mobile_ca_coord[mobile_not_nan_mask] + reference_length = _get_reference_length( + reference_length, len(fixed_seq), len(mobile_seq) + ) + + # 1. step + anchors = _find_anchors_structure_based(fixed_seq, mobile_seq, substitution_matrix) + _, transform = superimpose( + *_filter_by_anchors(fixed_ca_coord, mobile_ca_coord, anchors) + ) + superimposed_ca_coord = transform.apply(mobile_ca_coord) + + # 2. step + anchors = _find_anchors_hybrid( + fixed_seq, + mobile_seq, + fixed_ca_coord, + superimposed_ca_coord, + substitution_matrix, + reference_length, + ) + _, transform = superimpose( + *_filter_by_anchors( + fixed_ca_coord, + mobile_ca_coord, + anchors, + superimposed_ca_coord, + reference_length, + ) + ) + superimposed_ca_coord = transform.apply(mobile_ca_coord) + + # 3. step + for n_iterations in itertools.count(1): + previous_anchors = anchors + anchors = _find_anchors_tm_based( + fixed_ca_coord, superimposed_ca_coord, reference_length + ) + _, transform = superimpose( + *_filter_by_anchors( + fixed_ca_coord, + mobile_ca_coord, + anchors, + superimposed_ca_coord, + reference_length, + ) + ) + superimposed_ca_coord = transform.apply(mobile_ca_coord) + if n_iterations >= max_iterations or np.array_equal(previous_anchors, anchors): + break + + # The anchors currently refer to the CA atoms only + # -> map anchors back to all-atom indices + fixed_anchors = np.where(fixed.atom_name == "CA")[0][anchors[:, 0]] + mobile_anchors = np.where(mobile.atom_name == "CA")[0][anchors[:, 1]] + return transform.apply(mobile), transform, fixed_anchors, mobile_anchors + + +def _concatenate_sequences(sequences): + """ + Concatenate the sequences into a single sequence. + + Parameters + ---------- + sequences : list of Sequence + The sequences to concatenate. + + Returns + ------- + sequence : Sequence + The concatenated sequence. + """ + # Start with an empty sequence of the same type + return sum(sequences, start=type(sequences[0])()) + + +def _filter_by_anchors( + fixed_ca_coord, + mobile_ca_coord, + anchors, + superimposed_ca_coord=None, + reference_length=None, +): + """ + Filter the coordinates by the anchor indices. + + Parameters + ---------- + fixed_ca_coord, mobile_ca_coord : ndarray, shape=(n,3) + The coordinates of the CA atoms of the fixed and mobile structure, + respectively. + anchors : ndarray, shape=(k,2) + The anchor indices. + superimposed_ca_coord : ndarray, shape=(m,3), optional + The coordinates of the CA atoms of the superimposed structure. + If given, the anchors are additionally filtered by the distance between the + fixed and superimposed structure, which must be lower than :math:`d_0`. + reference_length : int, optional + The reference length used to compute :math:`d_0`. + Needs to be given if `superimposed_ca_coord` is given. + + Returns + ------- + anchor_fixed_coord, anchor_mobile_coord : ndarray, shape=(k,3) + The anchor coordinates of the fixed and mobile structure. + """ + anchor_fixed_coord = fixed_ca_coord[anchors[:, 0]] + anchor_mobile_coord = mobile_ca_coord[anchors[:, 1]] + if reference_length is not None and superimposed_ca_coord is not None: + anchor_superimposed_coord = superimposed_ca_coord[anchors[:, 1]] + mask = _mask_by_d0_threshold( + anchor_fixed_coord, anchor_superimposed_coord, reference_length + ) + anchor_fixed_coord = anchor_fixed_coord[mask] + anchor_mobile_coord = anchor_mobile_coord[mask] + return anchor_fixed_coord, anchor_mobile_coord + + +def _find_anchors_structure_based(fixed_seq, mobile_seq, substitution_matrix): + alignment = align_optimal( + fixed_seq, + mobile_seq, + substitution_matrix, + gap_penalty=(-_get_median_match_score(substitution_matrix), 0), + terminal_penalty=False, + max_number=1, + )[0] + # Cannot anchor gaps + alignment = remove_gaps(alignment) + # Anchors must be similar amino acids + alignment_codes = get_codes(alignment) + score_matrix = substitution_matrix.score_matrix() + anchor_mask = score_matrix[alignment_codes[0], alignment_codes[1]] > 0 + anchors = alignment.trace[anchor_mask] + return anchors + + +def _find_anchors_hybrid( + fixed_seq, + mobile_seq, + fixed_ca_coord, + mobile_ca_coord, + substitution_matrix, + reference_length, +): + # Bring substitution scores into the range of pairwise TM scores + scale_factor = _get_median_match_score(substitution_matrix) + # Create positional substitution matrix to be able to add the TM-score to it: + # The TM-score is based on the coordinates of a particular residue and not on the + # general symbol in the structural alphabet + # Hence, the shape of the substitution matrix must reflect the number of residues + # instead of the number of symbols in the structural alphabet + positional_matrix, fixed_seq, mobile_seq = substitution_matrix.as_positional( + fixed_seq, + mobile_seq, + ) + + tm_score_matrix = _pairwise_tm_score( + fixed_ca_coord, mobile_ca_coord, reference_length + ) + sa_score_matrix = positional_matrix.score_matrix() + # Scale the score matrix and the gap penalty to avoid rounding errors + # when the score matrix is converted to integer type + hybrid_score_matrix = _SCORE_SCALING * ( + sa_score_matrix / scale_factor + tm_score_matrix + ) + gap_penalty = _SCORE_SCALING * _HYBRID_PENALTY + hybrid_matrix = SubstitutionMatrix( + positional_matrix.get_alphabet1(), + positional_matrix.get_alphabet2(), + hybrid_score_matrix.astype(np.int32), + ) + alignment = align_optimal( + fixed_seq, + mobile_seq, + hybrid_matrix, + (gap_penalty, 0), + terminal_penalty=False, + max_number=1, + )[0] + alignment = remove_gaps(alignment) + anchors = alignment.trace + return anchors + + +def _find_anchors_tm_based(fixed_ca_coord, mobile_ca_coord, reference_length): + # The substitution matrix is positional -> Any positional sequence suffices + fixed_seq = PurePositionalSequence(len(fixed_ca_coord)) + mobile_seq = PurePositionalSequence(len(mobile_ca_coord)) + tm_score_matrix = _SCORE_SCALING * _pairwise_tm_score( + fixed_ca_coord, mobile_ca_coord, reference_length + ) + gap_penalty = _SCORE_SCALING * _TM_GAP_PENALTY + matrix = SubstitutionMatrix( + fixed_seq.alphabet, + mobile_seq.alphabet, + tm_score_matrix.astype(np.int32), + ) + alignment = align_optimal( + fixed_seq, + mobile_seq, + matrix, + (gap_penalty, 0), + terminal_penalty=False, + max_number=1, + )[0] + alignment = remove_gaps(alignment) + anchors = alignment.trace + return anchors + + +def _get_median_match_score(substitution_matrix): + """ + Get the median score of two symbols matching. + + Parameters + ---------- + substitution_matrix : SubstitutionMatrix + The substitution matrix to get the median match score from. + Must be symmetric. + + Returns + ------- + score : int + The median match score. + + Notes + ----- + The median is used instead of the mean, as the score range can be quite large, + especially when the matrix assigns an arbitrary score to the *undefined symbol*. + Furthermore, this ensures that the return value is an integer, which is required + for using it as gap penalty. + """ + return np.median(np.diagonal(substitution_matrix.score_matrix())) + + +def _mask_by_d0_threshold(fixed_ca_coord, mobile_ca_coord, reference_length): + """ + Mask every pairwise distance below the :math:`d_0` threshold. + + Parameters + ---------- + fixed_ca_coord, mobile_ca_coord : ndarray, shape=(n,3) + The coordinates of the CA atoms of the fixed and mobile structure whose distance + is measured. + reference_length : int + The reference length used to compute :math:`d_0`. + + Returns + ------- + mask : ndarray, shape=(n,), dtype=bool + A boolean mask that indicates which distances are below the :math:`d_0` + threshold. + """ + mask = distance(fixed_ca_coord, mobile_ca_coord) < _d0(reference_length) + if not np.any(mask): + raise ValueError("No anchors found") + return mask + + +def _pairwise_tm_score(reference_coord, subject_coord, reference_length): + """ + Compute the TM score for the Cartesian product of two coordinate arrays. + + Parameters + ---------- + reference_coord, subject_coord : ndarray, shape=(p,3) or shape=(q,3), dtype=float + The coordinates of the CA atoms to compute all pairwise distances between. + reference_length : int + The reference length used to compute :math:`d_0`. + + Returns + ------- + tm_score : ndarray, shape=(p,q), dtype=float + The TM score for the Cartesian product of the two coordinate arrays. + """ + distance_matrix = distance( + reference_coord[:, np.newaxis, :], + subject_coord[np.newaxis, :, :], + ) + return _tm_score(distance_matrix, reference_length) + + +def _tm_score(distance, reference_length): + """ + Compute the TM score for the given distances. + + Parameters + ---------- + distance : float or ndarray + The distance(s) between the CA atoms of two residues. + reference_length : int + The reference length used to compute :math:`d_0`. + + Returns + ------- + tm_score : float or ndarray + The TM score for the given distances. + """ + d0 = max(_d0(reference_length), _D0_MIN) + return 1 / (1 + (distance / d0) ** 2) + + +def _d0(reference_length): + """ + Compute the :math:`d_0` threshold. + + Parameters + ---------- + reference_length : int + The reference length used to compute :math:`d_0`. + + Returns + ------- + d0 : float + The :math:`d_0` threshold. + """ + # Constants taken from Zhang2004 + return 1.24 * (reference_length - 15) ** (1 / 3) - 1.8 + + +def _get_reference_length(user_parameter, reference_length, subject_length): + """ + Get the reference length to normalize the TM-score and compute :math:`d_0`. + + Parameters + ---------- + user_parameter : int or {"shorter", "longer", "reference"} + The value given by the user via the `reference_length` parameter. + reference_length, subject_length : int + The lengths of the reference and subject structure, respectively. + """ + match user_parameter: + case "shorter": + return min(reference_length, subject_length) + case "longer": + return max(reference_length, subject_length) + case "reference": + return reference_length + case int(number): + return number + case _: + raise ValueError(f"Unsupported reference length: '{user_parameter}'") diff --git a/tests/structure/data/homologs/1gl4.bcif b/tests/structure/data/homologs/1gl4.bcif new file mode 100644 index 000000000..875823d41 Binary files /dev/null and b/tests/structure/data/homologs/1gl4.bcif differ diff --git a/tests/structure/data/homologs/1hml.bcif b/tests/structure/data/homologs/1hml.bcif new file mode 100644 index 000000000..e824f1391 Binary files /dev/null and b/tests/structure/data/homologs/1hml.bcif differ diff --git a/tests/structure/data/homologs/1p4k.bcif b/tests/structure/data/homologs/1p4k.bcif new file mode 100644 index 000000000..7b4074d6d Binary files /dev/null and b/tests/structure/data/homologs/1p4k.bcif differ diff --git a/tests/structure/data/homologs/1qgi.bcif b/tests/structure/data/homologs/1qgi.bcif new file mode 100644 index 000000000..ff2dcd71b Binary files /dev/null and b/tests/structure/data/homologs/1qgi.bcif differ diff --git a/tests/structure/data/homologs/2nwd.bcif b/tests/structure/data/homologs/2nwd.bcif new file mode 100644 index 000000000..df11d3a97 Binary files /dev/null and b/tests/structure/data/homologs/2nwd.bcif differ diff --git a/tests/structure/data/homologs/3kcs.bcif b/tests/structure/data/homologs/3kcs.bcif new file mode 100644 index 000000000..fa37868f4 Binary files /dev/null and b/tests/structure/data/homologs/3kcs.bcif differ diff --git a/tests/structure/data/homologs/3lsj.bcif b/tests/structure/data/homologs/3lsj.bcif new file mode 100644 index 000000000..fd58e46e2 Binary files /dev/null and b/tests/structure/data/homologs/3lsj.bcif differ diff --git a/tests/structure/data/homologs/3lzm.bcif b/tests/structure/data/homologs/3lzm.bcif new file mode 100644 index 000000000..8760053a1 Binary files /dev/null and b/tests/structure/data/homologs/3lzm.bcif differ diff --git a/tests/structure/data/homologs/3rd3.bcif b/tests/structure/data/homologs/3rd3.bcif new file mode 100644 index 000000000..cf7059420 Binary files /dev/null and b/tests/structure/data/homologs/3rd3.bcif differ diff --git a/tests/structure/data/homologs/4osx.bcif b/tests/structure/data/homologs/4osx.bcif new file mode 100644 index 000000000..89d0f0567 Binary files /dev/null and b/tests/structure/data/homologs/4osx.bcif differ diff --git a/tests/structure/data/homologs/6oa8.bcif b/tests/structure/data/homologs/6oa8.bcif new file mode 100644 index 000000000..c83c923ee Binary files /dev/null and b/tests/structure/data/homologs/6oa8.bcif differ diff --git a/tests/structure/data/homologs/README.rst b/tests/structure/data/homologs/README.rst new file mode 100644 index 000000000..2b7920377 --- /dev/null +++ b/tests/structure/data/homologs/README.rst @@ -0,0 +1,9 @@ +Pairs of protein structures with structural homology +==================================================== + +1P4K + 4OSX: Dimer with high sequence and structure similarity +3LSJ + 3RD3: Dimer with high structural similarity but low sequence homology +2NWD + 1HML: Structure with high sequence and structure similarity +3KCS + 6OA8: Structure with high sequence and structure similarity +2NWD + 1QGI: Structure with medium structural similarity but low sequence homology +3KCS + 1GL4: Structure with medium structural similarity but low sequence homology, one of the structures has two chains \ No newline at end of file diff --git a/tests/structure/data/tm/README.rst b/tests/structure/data/tm/README.rst new file mode 100644 index 000000000..4ed2db425 --- /dev/null +++ b/tests/structure/data/tm/README.rst @@ -0,0 +1,6 @@ +Structures with associated TM-score +=================================== + +`tm_scores.json` contains the TM-scores for the test structures. +The score can be reproduced by running ``tm.py``. +This requires the installation of the ``USalign`` software. \ No newline at end of file diff --git a/tests/structure/data/tm/tm.py b/tests/structure/data/tm/tm.py new file mode 100644 index 000000000..a38038a94 --- /dev/null +++ b/tests/structure/data/tm/tm.py @@ -0,0 +1,46 @@ +import json +import re +import subprocess +import tempfile +from pathlib import Path +import biotite.database.rcsb as rcsb +import biotite.structure.io.pdbx as pdbx + +PDB_IDS = ["1l2y", "1gya"] + + +def atoms_to_temporary_cif(atoms): + file = pdbx.CIFFile() + pdbx.set_structure(file, atoms) + tmp_file = tempfile.NamedTemporaryFile(suffix=".cif", mode="w") + file.write(tmp_file) + tmp_file.flush() + return tmp_file + + +def tm_score_from_us_align(reference, subject): + reference_file = atoms_to_temporary_cif(reference) + subject_file = atoms_to_temporary_cif(subject) + + # Do not run superposition to be able to use the original structure in the test + # -> "-se" + completed_process = subprocess.run( + ["USalign", "-se", subject_file.name, reference_file.name], + check=True, + capture_output=True, + text=True, + ) + tm_score_match = re.search(r"TM-score= ([\d|\.]*)", completed_process.stdout) + return float(tm_score_match.group(1)) + + +if __name__ == "__main__": + tm_scores = {} + for pdb_id in PDB_IDS: + pdbx_file = pdbx.BinaryCIFFile.read(rcsb.fetch(pdb_id, "bcif")) + atoms = pdbx.get_structure(pdbx_file) + tm_scores[pdb_id] = [ + tm_score_from_us_align(atoms[0], subject) for subject in atoms + ] + with open(Path(__file__).parent / "tm_scores.json", "w") as file: + json.dump(tm_scores, file, indent=4) diff --git a/tests/structure/data/tm/tm_scores.json b/tests/structure/data/tm/tm_scores.json new file mode 100644 index 000000000..4ed322995 --- /dev/null +++ b/tests/structure/data/tm/tm_scores.json @@ -0,0 +1,62 @@ +{ + "1l2y": [ + 1.0, + 0.6979, + 0.5887, + 0.58205, + 0.74744, + 0.60918, + 0.64239, + 0.62509, + 0.63319, + 0.68341, + 0.66153, + 0.54604, + 0.58137, + 0.61581, + 0.69548, + 0.69721, + 0.80878, + 0.71573, + 0.72984, + 0.69939, + 0.63919, + 0.70722, + 0.63822, + 0.51971, + 0.53375, + 0.55052, + 0.58181, + 0.66779, + 0.67314, + 0.7179, + 0.74457, + 0.63097, + 0.73232, + 0.67231, + 0.65353, + 0.65955, + 0.62962, + 0.70269 + ], + "1gya": [ + 1.0, + 0.90104, + 0.91366, + 0.87609, + 0.8881, + 0.92078, + 0.91662, + 0.90203, + 0.90914, + 0.91415, + 0.92407, + 0.89687, + 0.89906, + 0.91444, + 0.89984, + 0.89861, + 0.89149, + 0.91315 + ] +} \ No newline at end of file diff --git a/tests/structure/test_tm.py b/tests/structure/test_tm.py new file mode 100644 index 000000000..c02b7b486 --- /dev/null +++ b/tests/structure/test_tm.py @@ -0,0 +1,184 @@ +import json +from pathlib import Path +import numpy as np +import pytest +import biotite.structure as struc +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +@pytest.fixture +def without_tm_gap_penalty(): + """ + Set the gap penalty for the iterative alignment step to 0 + """ + import biotite.structure.tm + + original_gap_penalty = biotite.structure.tm._TM_GAP_PENALTY + biotite.structure.tm._TM_GAP_PENALTY = 0 + yield + biotite.structure.tm._TM_GAP_PENALTY = original_gap_penalty + + +@pytest.mark.parametrize("reference_length", ["shorter", "longer", "reference", 20]) +def test_tm_score_perfect(reference_length): + """ + Check if the TM-score of a structure with itself as reference is 1.0. + + Test different reference lengths here as well. + """ + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1l2y.bcif") + atoms = pdbx.get_structure(pdbx_file, model=1) + ca_indices = np.where(atoms.atom_name == "CA")[0] + + assert struc.tm_score(atoms, atoms, ca_indices, ca_indices, reference_length) == 1.0 + + +@pytest.mark.parametrize("pdb_id", ["1l2y", "1gya"]) +def test_tm_score_consistency(pdb_id): + """ + Check if the TM-score is correctly computed by comparing it to the result of + *USalign*. + To decouple TM-score calculation from :func:`superimpose_structural_homologs()`, + the TM-score is calculated for two models of the same length. + """ + with open(Path(data_dir("structure")) / "tm" / "tm_scores.json", "r") as file: + ref_tm_scores = json.load(file)[pdb_id] + + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{pdb_id}.bcif") + atoms = pdbx.get_structure(pdbx_file) + atoms = atoms[:, struc.filter_amino_acids(atoms)] + reference = atoms[0] + ca_indices = np.where(atoms.atom_name == "CA")[0] + + test_tm_scores = [ + struc.tm_score(reference, atoms[i], ca_indices, ca_indices) + for i in range(0, atoms.stack_depth()) + ] + + assert test_tm_scores == pytest.approx(ref_tm_scores, abs=1e-2) + + +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("structural_alphabet", ["3Di", "PB"]) +def test_superimpose_identical(without_tm_gap_penalty, seed, structural_alphabet): + """ + Check if :func:`superimpose_structural_homologs()` is able to superimpose + two identical complexes with randomized deletions. + As the alignment should detect the deletions, the superimposed + RMSD should be 0 and the TM-score should be 1. + + For the iterative alignment step the gap penalty is set to 0, to avoid the situation + where non-corresponding residues are aligned to avoid the gap penalty. + """ + P_CONSERVATION = 0.8 + + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / "1aki.bcif") + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + + # Delete random residues + fixed = atoms + rng = np.random.default_rng(seed) + # Only delete residues in one structure to avoid cases where non-corresponding + # residues are rightfully aligned, for example the deletions + # 01234-6789 + # 0123-56789 + # would align to + # 012346789 + # 012356789 + # The non-corresponding positions '4' and '5' would align to each other in this case + mobile = _delete_random_residues(atoms, rng, P_CONSERVATION) + # Randomly move structure to increase the challenge + mobile.coord = _transform_random_affine(mobile.coord, rng) + + super, _, fix_indices, mob_indices = struc.superimpose_structural_homologs( + # Define max-iterations to avoid infinite loop if something goes wrong + fixed, + mobile, + structural_alphabet, + max_iterations=100, + ) + + # The superimposition anchors must be CA atoms + assert np.all(fixed.atom_name[fix_indices] == "CA") + assert np.all(mobile.atom_name[mob_indices] == "CA") + # Expect that the found corresponding residues are actually the same residues from + # the original structure in most cases + assert fixed.res_id[fix_indices].tolist() == mobile.res_id[mob_indices].tolist() + assert struc.tm_score(fixed, super, fix_indices, mob_indices) == pytest.approx( + 1.0, abs=1e-3 + ) + assert struc.rmsd(fixed[fix_indices], super[mob_indices]) == pytest.approx( + 0.0, abs=1e-3 + ) + + +@pytest.mark.parametrize( + "fixed_pdb_id, mobile_pdb_id, ref_tm_score", + [ + ("1p4k", "4osx", 0.87), + ("3lsj", "3rd3", 0.78), + ("2nwd", "1hml", 0.91), + ("3kcs", "6oa8", 0.93), + ("2nwd", "1qgi", 0.55), + ("3kcs", "1gl4", 0.82), + ], +) +def test_superimpose_consistency(fixed_pdb_id, mobile_pdb_id, ref_tm_score): + """ + Check if two complexes with high structure similarity, can be properly superimposed + with :func:`superimpose_structural_homologs()`, even if sequence homology is low. + The performance is evaluated in terms of the TM-score compared to the result of + *US-align*. + + The chosen structure pairs have at least a TM-score of 0.5 according to *US-align*. + This ensures that the structures have 'about the same fold' and therefore the + superimposition is not spurious. + + *US-align* is used instead of *TM-align* to be able to align multimeric structures. + """ + # Sometimes US-align might perform slightly better + SCORE_TOLERANCE = 0.05 + + fixed = _get_peptide_assembly( + Path(data_dir("structure")) / "homologs" / f"{fixed_pdb_id}.bcif" + ) + mobile = _get_peptide_assembly( + Path(data_dir("structure")) / "homologs" / f"{mobile_pdb_id}.bcif" + ) + + super, _, fix_indices, mob_indices = struc.superimpose_structural_homologs( + fixed, + mobile, + ) + assert ( + struc.tm_score(fixed, super, fix_indices, mob_indices) + >= ref_tm_score - SCORE_TOLERANCE + ) + + +def _transform_random_affine(coord, rng): + coord = struc.translate(coord, rng.uniform(low=0, high=10, size=3)) + coord = struc.rotate(coord, rng.uniform(low=0, high=2 * np.pi, size=3)) + return coord + + +def _delete_random_residues(atoms, rng, p_conservation): + residue_starts = struc.get_residue_starts(atoms) + conserved_residue_starts = rng.choice( + residue_starts, size=int(p_conservation * len(residue_starts)), replace=False + ) + conservation_mask = np.any( + struc.get_residue_masks(atoms, conserved_residue_starts), axis=0 + ) + return atoms[..., conservation_mask] + + +def _get_peptide_assembly(bcif_file_path): + """ + Load assembly from a BinaryCIF file and filter peptide residues. + """ + pdbx_file = pdbx.BinaryCIFFile.read(bcif_file_path) + atoms = pdbx.get_assembly(pdbx_file, model=1) + return atoms[struc.filter_amino_acids(atoms)] diff --git a/tests/test_doctest.py b/tests/test_doctest.py index 08307689a..6da470bf8 100644 --- a/tests/test_doctest.py +++ b/tests/test_doctest.py @@ -10,7 +10,6 @@ from os.path import join import numpy as np import pytest -import biotite.structure as struc import biotite.structure.io as strucio from tests.util import cannot_connect_to, cannot_import, is_not_installed @@ -201,9 +200,8 @@ def test_doctest(package_name, context_package_names): globs["np"] = np # Add frequently used objects atoms = strucio.load_structure( - join(".", "tests", "structure", "data", "1l2y.bcif"), + join(".", "tests", "structure", "data", "1l2y.bcif"), include_bonds=True ) - atoms.bonds = struc.connect_via_residue_names(atoms) globs["atom_array_stack"] = atoms globs["atom_array"] = globs["atom_array_stack"][0]