Source code for biobuild.structural.stitch

"""
Functions to stitch molecules together even in the absence of a patch
"""

from typing import Union

import numpy as np

import Bio.PDB as bio

import biobuild.core.base_classes as base_classes
import biobuild.core.Molecule as Molecule
import biobuild.core.Linkage as Linkage
import biobuild.structural.connector as base
import biobuild.structural.base as structural_base
import biobuild.optimizers as optimizers



[docs]
class Stitcher(base.Connector):
    """
    This class is responsible for stitching molecules together
    when there is not a patch available which specifies the immediate
    geometry of their connection. The only required input is a pair of atoms
    that should form a new bond, and two tuples of atoms (at least one from each molecule)
    that are being removed in the process.

    Parameters
    ----------
    copy_target : bool
        Whether to copy the target molecule before stitching
    copy_source : bool
        Whether to copy the source molecule before stitching
    """

    def __init__(self, copy_target: bool = False, copy_source: bool = False):
        super().__init__(copy_target, copy_source)
        self._removals = (None, None)
        self._policy = None, None
        self._optimize_bystander_radius = 8


[docs]
    def apply(
        self,
        target: "Molecule.Molecule",
        source: "Molecule.Molecule",
        target_removals: tuple = None,
        source_removals: tuple = None,
        target_atom: Union[int, "base_classes.Atom"] = None,
        source_atom: Union[int, "base_classes.Atom"] = None,
        target_residue: Union[int, "base_classes.Residue"] = None,
        source_residue: Union[int, "base_classes.Residue"] = None,
        optimization_steps: int = 30,
        **kwargs,
    ) -> "Molecule.Molecule":
        """
        Stitch the source and target molecules together

        Parameters
        ----------
        target : Molecule
            The target molecule
        source : Molecule
            The source molecule. This will be attached to the target molecule.
        target_removals : tuple
            A tuple of atoms to be removed from the target molecule. These must be the atom's ids within the attaching residue.
            All atoms must be part fo the same residue as the attaching `target_atom`.
            If not provided, just any Hydrogen atom next to the `target_atom` will be used.
        source_removals : tuple
            A tuple of atoms to be removed from the source molecule. These must be the atom's ids within the attaching residue.
            All atoms must be part fo the same residue as the attaching `source_atom`.
            If not provided, just any Hydrogen atom next to the `source_atom` will be used.
        target_atom : int or Atom
            The atom on the target molecule to which the source molecule will be attached. This may either be the atom object directly or its serial number.
            If none is provided, the molecule's "root atom" is used (if defined).
        source_atom : int or Atom
            The atom on the source molecule to which the target molecule will be attached. This may either be the atom object directly or its serial number.
            If none is provided, the molecule's "root atom" is used (if defined).
        target_residue : int or Residue
            The residue hosting the target atom. This is only required if the target atom is not given directly or specified by name.
        source_residue : int or Residue
            The residue hosting the source atom. This is only required if the source atom is not given directly or specified by name.
        optimization_steps : int, optional
            The number of steps to take in the optimization process.
        **kwargs
            Additional keyword arguments to pass to the optimizer. See the documentation for `biobuild.optimizers.agents.optimize` for more details.

        Returns
        -------
        tuple
            The target and source molecules after stitching. At this point, the source molecule is aligned
            to the target molecules but not yet integrated into it. Use the `merge` method to do this.
        """
        if not target_removals:
            target_removals = []
        if not source_removals:
            source_removals = []

        if self.copy_target:
            target = target.copy()
            if target_residue:
                if not isinstance(target_residue, int):
                    target_residue = target_residue.id[1]
                target_residue = target.get_residue(target_residue, by="seqid")
                target_atom = target.get_atom(target_atom, residue=target_residue)
        if self.copy_source:
            source = source.copy()
            if source_residue:
                if not isinstance(source_residue, int):
                    source_residue = source_residue.id[1]
                source_residue = source.get_residue(source_residue, by="seqid")
            source_atom = source.get_atom(source_atom, residue=source_residue)

        self.target = target
        self.source = source

        self._anchors = self.get_anchors(
            (target_atom, source_atom), target_residue, source_residue
        )
        self._removals = self._find_removals(target_removals, source_removals)

        self._align_anchors()

        self._remove_atoms()
        self._optimize(optimization_steps, **kwargs)

        return self.target, self.source


    def _align_anchors(self):
        """
        Align the anchor atoms such that they are in the right positions to form a bond
        This will move the source molecule such that its anchor atom is in the position
        of one of the target molecule's removed atoms
        """
        neighs = self.target.get_neighbors(self._anchors[0])
        ref_target_atom = next(i for i in neighs if i in self._removals[0])

        neighs = self.source.get_neighbors(self._anchors[1])
        ref_source_atom = next(i for i in neighs if i in self._removals[1])

        # self._v.draw_point("anchor target", self._anchors[0].coord, color="blue")
        # self._v.draw_point("anchor source", self._anchors[1].coord, color="blue")
        # self._v.draw_point("ref_target_atom", ref_target_atom.coord, color="red")
        # self._v.draw_point("ref_source_atom", ref_source_atom.coord, color="red")

        _old_coords = np.stack(
            [
                self._anchors[1].coord,
                ref_source_atom.coord,
            ]
        )
        _new_coords = np.array(
            [
                ref_target_atom.coord,
                self._anchors[0].coord,
            ]
        )

        # compute translation vector
        old_centroid = _old_coords.mean(axis=0)
        new_centroid = _new_coords.mean(axis=0)

        _relative_old_coords = _old_coords - old_centroid
        _relative_new_coords = _new_coords - new_centroid

        H = (_relative_old_coords).T.dot(_relative_new_coords)
        U, S, VT = np.linalg.svd(H)
        R = VT.T @ U.T

        # self._v.draw_edges(self.source.bonds, color="black", opacity=0.5)
        atom_coords = np.array([atom.coord for atom in self.source.get_atoms()])
        atom_coords = (R @ (atom_coords - old_centroid).T).T + new_centroid
        for coord, atom in zip(atom_coords, self.source.get_atoms()):
            atom.set_coord(coord)

        # for atom in self.source.get_atoms():
        #     vec = atom.coord - old_centroid
        #     new_coord = (R @ vec) + old_centroid + translation_vector
        #     atom.set_coord(new_coord)

        # self._v.draw_edges(self.source.bonds, color="orange", opacity=0.5)
        # self._v.draw_point("anchor source (new)", self._anchors[1].coord, color="teal")

    def _find_removals(self, target_removals, source_removals):
        """
        Find the atoms to remove from the target and source molecules
        while stitching them together
        """
        if len(target_removals) == 0:
            _target_removals = [
                next(
                    i
                    for i in self.target.get_neighbors(self._anchors[0])
                    if i.element == "H"
                )
            ]
        else:
            _target_removals = [
                self.target.get_atom(atom, residue=self._target_residue)
                for atom in target_removals
            ]
        if len(source_removals) == 0:
            _source_removals = [
                next(
                    i
                    for i in self.source.get_neighbors(self._anchors[1])
                    if i.element == "H"
                )
            ]
        else:
            _source_removals = [
                self.source.get_atom(atom, residue=self._source_residue)
                for atom in source_removals
            ]
        return (set(_target_removals), set(_source_removals))

    def _remove_atoms(self):
        """
        Remove the atoms specified in the removals list
        """
        # self.target._remove_atoms(*self._removals[0])
        # self.source._remove_atoms(*self._removals[1])
        # return
        mapping = {
            0: self.target,
            1: self.source,
        }
        for i, removals in enumerate(self._removals):
            obj = mapping[i]

            for atom in removals:
                bonds = (
                    i
                    for i in obj._bonds
                    if atom in i
                    # if atom.full_id == i[0].full_id or atom.full_id == i[1].full_id
                )
                for bond in bonds:
                    # obj._bonds.remove(bond)
                    obj._remove_bond(*bond)
                obj._AtomGraph.remove_node(atom)
                p = atom.get_parent()
                if p:
                    p.detach_child(atom.get_id())
                    # p.remove(atom)
                    # p.child_dict.pop(atom.id)
                    atom.set_parent(p)

            adx = 1
            for atom in obj._model.get_atoms():
                atom.set_serial_number(adx)
                adx += 1

    def _optimize(self, steps: int = 30, **kwargs):
        """
        Optimize the geometry of the source molecule

        Parameters
        ----------
        steps : int
            The number of optimization steps to perform
        """

        self.target.adjust_indexing(self.source)
        bonds = self.target.get_bonds(self._target_residue)
        bonds.extend(self.source.get_bonds(self._source_residue))

        # the double Molecule seems necessary when the file is not direclty run...
        tmp = Molecule.Molecule.empty("tmp")

        orig_coords_target = {
            atom: (atom.coord[0], atom.coord[1], atom.coord[2])
            for atom in self._target_residue.child_list
        }
        orig_coords_source = {
            atom: (atom.coord[0], atom.coord[1], atom.coord[2])
            for atom in self._source_residue.child_list
        }
        target_residue_parent = self._target_residue.get_parent()
        source_residue_parent = self._source_residue.get_parent()

        orig_parents_target = {}
        orig_parents_source = {}

        tmp.add_residues(
            self._target_residue,
            self._source_residue,
            adjust_seqid=False,
            _copy=False,
        )

        # we could also add all close-by atoms
        bystanders = base_classes.Residue("bystanders", 3, "")
        include_bystanders = False
        tmp.add_residues(bystanders, _copy=False)
        r = self._optimize_bystander_radius
        for atom in self.target.get_atoms():
            if structural_base.compute_distance(atom, self._anchors[0]) < r:
                if atom in tmp.get_atoms():
                    continue
                orig_coords_target[atom] = (atom.coord[0], atom.coord[1], atom.coord[2])
                orig_parents_target[atom] = atom.parent

                bystanders.add(atom)
                tmp._AtomGraph.add_node(atom)
                include_bystanders = True

        for atom in self.source.get_atoms():
            if structural_base.compute_distance(atom, self._anchors[1]) < r:
                if atom in tmp.get_atoms():
                    continue
                orig_coords_source[atom] = (atom.coord[0], atom.coord[1], atom.coord[2])
                orig_parents_source[atom] = atom.parent

                bystanders.add(atom)
                tmp._AtomGraph.add_node(atom)
                include_bystanders = True

        tmp._add_bonds(*bonds)
        tmp._add_bond(self._anchors[0], self._anchors[1])

        graph = tmp.make_residue_graph(False)
        graph.make_detailed(n_samples=0.5)
        if include_bystanders:
            graph.add_nodes_from(bystanders.get_atoms())

        edges = graph.find_rotatable_edges()
        env = optimizers.DistanceRotatron(
            graph,
            edges,
            concatenation_function=optimizers.concatenation_function_linear,
        )

        best, _ = optimizers.swarm_optimize(
            env,
            n_particles=kwargs.pop("n_particles", 5),
            max_steps=int(steps),
            **kwargs,
        )
        self._policy = edges, best

        self._target_residue.parent = target_residue_parent
        self._source_residue.parent = source_residue_parent

        for atom in self._target_residue.child_list:
            atom.parent = self._target_residue
            orig_coords = orig_coords_target[atom]
            atom.coord[0] = orig_coords[0]
            atom.coord[1] = orig_coords[1]
            atom.coord[2] = orig_coords[2]

        for atom in self._source_residue.child_list:
            atom.parent = self._source_residue
            orig_coords = orig_coords_source[atom]
            atom.coord[0] = orig_coords[0]
            atom.coord[1] = orig_coords[1]
            atom.coord[2] = orig_coords[2]

        for atom in orig_parents_source:
            atom.parent = orig_parents_source[atom]
            orig_coords = orig_coords_source[atom]
            atom.coord[0] = orig_coords[0]
            atom.coord[1] = orig_coords[1]
            atom.coord[2] = orig_coords[2]

        for atom in orig_parents_target:
            atom.parent = orig_parents_target[atom]
            orig_coords = orig_coords_target[atom]
            atom.coord[0] = orig_coords[0]
            atom.coord[1] = orig_coords[1]
            atom.coord[2] = orig_coords[2]

        # ===============================================================
        # This works really well, but is slow because of the copying
        # ===============================================================
        # tmp.add_residues(
        #     self._target_residue,
        #     self._source_residue,
        #     adjust_seqid=False,
        #     _copy=True,
        # )

        # # we could also add all close-by atoms
        # r = self._optimize_bystander_radius
        # for atom in self.target.get_atoms():
        #     if atom - self._anchors[0] < r or atom - self._anchors[1] < r:
        #         if atom in tmp.get_atoms():
        #             continue
        #         tmp.add_atoms(atom, _copy=True)
        # for atom in self.source.get_atoms():
        #     if atom - self._anchors[0] < r or atom - self._anchors[1] < r:
        #         if atom in tmp.get_atoms():
        #             continue
        #         tmp.add_atoms(atom, _copy=True)

        # for b in bonds:
        #     b = (b[0].serial_number, b[1].serial_number)
        #     tmp.add_bond(*b)
        # tmp.add_bond(self._anchors[0].serial_number, self._anchors[1].serial_number)

        # graph = tmp.make_residue_graph()
        # graph.make_detailed(include_outliers=True, include_heteroatoms=True, f=1.2)

        # edges = sorted(tmp.get_residue_connections())
        # env = optimizers.MultiBondRotatron(graph, edges, mask_same_residues=False)

        # best, _ = optimizers.scipy_optimize(env, int(steps), **kwargs)
        # self._policy = edges, best
        # ================================================================================

        # self.best.append(best)
        # f, reward, *_ = env.step(best)
        # self.rewards.append(reward)

        # self.env = env


[docs]
    def merge(self) -> "Molecule.Molecule":
        """
        Merge the source molecule into the target molecule
        """
        self.target.add_residues(*self.source.residues)
        self.target._bonds.extend(self.source.bonds)
        self.target._AtomGraph.migrate_bonds(self.source._AtomGraph)
        self.target.add_bond(self._anchors[0], self._anchors[1])
        # self.target.reindex()

        # v = bb.utils.visual.MoleculeViewer3D(self.target)

        if self._policy[0] is not None:
            edges, angles = self._policy

            for bond, angle in zip(edges, angles):
                # we used to use full_id which worked overall good. I think we used to have
                # also a version with serial for a while. Let's do this again...
                # UPDATE: By now there should be no issues with the atom's being in there
                # so we don't actually need to use the full_id anymore
                _bond = bond

                # v.draw_vector(
                #     "rotation",
                #     bond[0].coord,
                #     1.1 * (bond[1].coord - bond[0].coord),
                #     color="orange",
                # )

                self.target.rotate_around_bond(
                    *_bond, angle, descendants_only=True, angle_is_degrees=False
                )

            self._policy = None, None

        # v.draw_edges(self.target.bonds, color="red")
        # v.show()
        return self.target




__default_keep_keep_stitcher__ = Stitcher(False, False)
"""
Default stitcher for the case that both molecules are kept
"""

# __default_keep_copy_stitcher__ = Stitcher(False, True)
# """
# Default stitcher for the case that the target molecule is kept and the source molecule is copied
# """

# __default_copy_copy_stitcher__ = Stitcher(True, True)
# """
# Default stitcher for the case that both molecules are copied
# """



[docs]
def stitch(
    target: "Molecule.Molecule",
    source: "Molecule.Molecule",
    recipe: "Linkage.Linkage" = None,
    target_removals: tuple = None,
    source_removals: tuple = None,
    target_atom: Union[int, "base_classes.Atom"] = None,
    source_atom: Union[int, "base_classes.Atom"] = None,
    target_residue: Union[int, "base_classes.Residue"] = None,
    source_residue: Union[int, "base_classes.Residue"] = None,
    optimization_steps: int = 1e4,
    copy_target: bool = False,
    copy_source: bool = False,
    **kwargs,
) -> "Molecule.Molecule":
    """
    Stitch two molecules together.
    Thereby the source molecule is integrated into the target molecule.

    Parameters
    ----------
    target : Molecule
        The target molecule
    source : Molecule
        The source molecule. This will be attached to the target molecule.
    recipe : AbstractRecipe
        The recipe to be used for stitching. If none is provided, the stitching instructions can be submitted manually via the other parameters.
    target_removals : tuple
        This parameter is only used if no recipe is provided.
        A tuple of atoms to be removed from the target molecule. These must be the atom's ids within the attaching residue.
        All atoms must be part fo the same residue as the attaching `target_atom`.
    source_removals : tuple
        This parameter is only used if no recipe is provided.
        A tuple of atoms to be removed from the source molecule. These must be the atom's ids within the attaching residue.
        All atoms must be part fo the same residue as the attaching `source_atom`.
    target_atom : int or Atom
        This parameter is only used if no recipe is provided.
        The atom on the target molecule to which the source molecule will be attached. This may either be the atom object directly or its serial number.
        If none is provided, the molecule's "root atom" is used (if defined).
    source_atom : int or Atom
        This parameter is only used if no recipe is provided.
        The atom on the source molecule to which the target molecule will be attached. This may either be the atom object directly or its serial number.
        If none is provided, the molecule's "root atom" is used (if defined).
    target_residue : int or Residue
        The residue hosting the target atom. This is only required if the target atom is not given directly or specified by name. If a recipe is provided, the "attach_residue" is used by default. A ValueError is raised if none is defined.
    source_residue : int or Residue
        The residue hosting the source atom. This is only required if the source atom is not given directly or specified by name. If a recipe is provided, the "attach_residue" is used by default. A ValueError is raised if none is defined.
    optimization_steps : int, optional
        The number of steps to take in the optimization process, by default 1e4
    copy_target : bool, optional
        Whether to copy the target molecule before stitching, by default False
    copy_source : bool, optional
        Whether to copy the source molecule before stitching, by default False
    **kwargs
        Additional keyword arguments to pass to the optimizer. See the documentation for `biobuild.optimizers.agents.optimize` for more details.

    Returns
    -------
    Molecule
        The stitched molecule
    """
    if recipe is not None:
        if target_residue is None:
            if target.attach_residue is None:
                target_residue = target.residues[-1]
                # raise ValueError(
                #     "A residue in the target molecule must be provided or the target molecule must define an attachment residue"
                # )
            else:
                target_residue = target.attach_residue
        if source_residue is None:
            if source.attach_residue is None:
                source_residue = source.residues[-1]
                # raise ValueError(
                #     "A residue in the source molecule must be provided or the source molecule must define an attachment residue"
                # )
            else:
                source_residue = source.attach_residue

        target_atom, source_atom = recipe._stitch_ref_atoms
        return stitch(
            target=target,
            source=source,
            target_removals=recipe.deletes[0],
            source_removals=recipe.deletes[1],
            target_atom=target_atom,
            source_atom=source_atom,
            target_residue=target_residue,
            source_residue=source_residue,
            optimization_steps=optimization_steps,
            **kwargs,
        )

    # =====================

    if copy_source:
        source = source.copy()
    if copy_target:
        target = target.copy()
    __default_keep_keep_stitcher__.apply(
        target=target,
        source=source,
        target_removals=target_removals,
        source_removals=source_removals,
        target_atom=target_atom,
        source_atom=source_atom,
        target_residue=target_residue,
        source_residue=source_residue,
        optimization_steps=optimization_steps,
        **kwargs,
    )
    return __default_keep_keep_stitcher__.merge()



if __name__ == "__main__":
    import biobuild as bb

    bb.load_sugars()
    glc = bb.Molecule.from_compound("GLC")

    s = Stitcher(False, False)
    s.apply(
        glc,
        glc.copy(),
        # ["O1", "HO1"],
        # [
        #     "HO4",
        # ],
        target_atom="C1",
        source_atom="O4",
    )
    s.merge().show()

    # ser = bb.Molecule.from_pubchem("SER")
    # ser.autolabel()

    # mol3 = bb.Molecule.from_pubchem("tert-butyl acetate")
    # # mol3.autolabel()

    # # attach the red bit to the serine
    # l1 = bb.linkage("C6", "O3", ["H12"], ["HO3"])

    # out = stitch(mol3, ser, l1)
    # out.show()
    # import biobuild as bb

    # glc = bb.Molecule.from_compound("GLC")
    # glc.repeat(4, "14bb")

    # glc2 = bb.Molecule.from_compound("GLC")

    # prot = bb.core.Scaffold.from_pdb(
    #     "/Users/noahhk/GIT/biobuild/support/examples/4tvp.prot.pdb"
    # )
    # prot.infer_bonds(restrict_residues=False)
    # s = Stitcher(True, True)

    # res = prot.find()[0]

    # s.apply(prot, glc, target_removals=(""))

    # # ---------------------------------------------
    # # This is a very nice code piece down here...
    # # ---------------------------------------------
    # # glc2.rotate_around_bond(6, 5, 68)
    # # glc2.rotate_around_bond(3, 4, 41)

    # # v = None
    # # n = 200
    # # with alive_progress.alive_bar(n) as bar:
    # #     for i in range(n):
    # #         s = Stitcher(True, True)
    # #         s.apply(glc, glc2, ("O1", "HO1"), ("HO4",), "C1", "O4", 1e4)
    # #         new_glc = s.merge()

    # #         if v is None:
    # #             v = bb.utils.visual.MoleculeViewer3D(new_glc)
    # #         else:
    # #             v.draw_edges(new_glc.bonds, color="red", opacity=0.02)
    # #         bar()
    # # if v: v.show()

    # # import pandas as pd
    # # import seaborn as sns
    # # import matplotlib.pyplot as plt

    # # df = pd.DataFrame(s.best, columns=["phi", "psi"])
    # # df["reward"] = s.rewards
    # # ax = sns.jointplot(data=df, x="phi", y="psi", kind="kde")
    # # sns.scatterplot(data=df, x="phi", y="psi", hue="reward", ax=ax.ax_joint)

    # # plt.show()
    # # pass