Source code for biobuild.structural.stitch

"""
Functions to stitch molecules together even in the absence of a patch
"""

from typing import Union

import numpy as np

import Bio.PDB as bio

import biobuild.core.base_classes as base_classes
import biobuild.core.Molecule as Molecule
import biobuild.core.Linkage as Linkage
import biobuild.structural.connector as base
import biobuild.structural.base as structural_base
import biobuild.optimizers as optimizers


[docs] class Stitcher(base.Connector): """ This class is responsible for stitching molecules together when there is not a patch available which specifies the immediate geometry of their connection. The only required input is a pair of atoms that should form a new bond, and two tuples of atoms (at least one from each molecule) that are being removed in the process. Parameters ---------- copy_target : bool Whether to copy the target molecule before stitching copy_source : bool Whether to copy the source molecule before stitching """ def __init__(self, copy_target: bool = False, copy_source: bool = False): super().__init__(copy_target, copy_source) self._removals = (None, None) self._policy = None, None self._optimize_bystander_radius = 8
[docs] def apply( self, target: "Molecule.Molecule", source: "Molecule.Molecule", target_removals: tuple = None, source_removals: tuple = None, target_atom: Union[int, "base_classes.Atom"] = None, source_atom: Union[int, "base_classes.Atom"] = None, target_residue: Union[int, "base_classes.Residue"] = None, source_residue: Union[int, "base_classes.Residue"] = None, optimization_steps: int = 30, **kwargs, ) -> "Molecule.Molecule": """ Stitch the source and target molecules together Parameters ---------- target : Molecule The target molecule source : Molecule The source molecule. This will be attached to the target molecule. target_removals : tuple A tuple of atoms to be removed from the target molecule. These must be the atom's ids within the attaching residue. All atoms must be part fo the same residue as the attaching `target_atom`. If not provided, just any Hydrogen atom next to the `target_atom` will be used. source_removals : tuple A tuple of atoms to be removed from the source molecule. These must be the atom's ids within the attaching residue. All atoms must be part fo the same residue as the attaching `source_atom`. If not provided, just any Hydrogen atom next to the `source_atom` will be used. target_atom : int or Atom The atom on the target molecule to which the source molecule will be attached. This may either be the atom object directly or its serial number. If none is provided, the molecule's "root atom" is used (if defined). source_atom : int or Atom The atom on the source molecule to which the target molecule will be attached. This may either be the atom object directly or its serial number. If none is provided, the molecule's "root atom" is used (if defined). target_residue : int or Residue The residue hosting the target atom. This is only required if the target atom is not given directly or specified by name. source_residue : int or Residue The residue hosting the source atom. This is only required if the source atom is not given directly or specified by name. optimization_steps : int, optional The number of steps to take in the optimization process. **kwargs Additional keyword arguments to pass to the optimizer. See the documentation for `biobuild.optimizers.agents.optimize` for more details. Returns ------- tuple The target and source molecules after stitching. At this point, the source molecule is aligned to the target molecules but not yet integrated into it. Use the `merge` method to do this. """ if not target_removals: target_removals = [] if not source_removals: source_removals = [] if self.copy_target: target = target.copy() if target_residue: if not isinstance(target_residue, int): target_residue = target_residue.id[1] target_residue = target.get_residue(target_residue, by="seqid") target_atom = target.get_atom(target_atom, residue=target_residue) if self.copy_source: source = source.copy() if source_residue: if not isinstance(source_residue, int): source_residue = source_residue.id[1] source_residue = source.get_residue(source_residue, by="seqid") source_atom = source.get_atom(source_atom, residue=source_residue) self.target = target self.source = source self._anchors = self.get_anchors( (target_atom, source_atom), target_residue, source_residue ) self._removals = self._find_removals(target_removals, source_removals) self._align_anchors() self._remove_atoms() self._optimize(optimization_steps, **kwargs) return self.target, self.source
def _align_anchors(self): """ Align the anchor atoms such that they are in the right positions to form a bond This will move the source molecule such that its anchor atom is in the position of one of the target molecule's removed atoms """ neighs = self.target.get_neighbors(self._anchors[0]) ref_target_atom = next(i for i in neighs if i in self._removals[0]) neighs = self.source.get_neighbors(self._anchors[1]) ref_source_atom = next(i for i in neighs if i in self._removals[1]) # self._v.draw_point("anchor target", self._anchors[0].coord, color="blue") # self._v.draw_point("anchor source", self._anchors[1].coord, color="blue") # self._v.draw_point("ref_target_atom", ref_target_atom.coord, color="red") # self._v.draw_point("ref_source_atom", ref_source_atom.coord, color="red") _old_coords = np.stack( [ self._anchors[1].coord, ref_source_atom.coord, ] ) _new_coords = np.array( [ ref_target_atom.coord, self._anchors[0].coord, ] ) # compute translation vector old_centroid = _old_coords.mean(axis=0) new_centroid = _new_coords.mean(axis=0) _relative_old_coords = _old_coords - old_centroid _relative_new_coords = _new_coords - new_centroid H = (_relative_old_coords).T.dot(_relative_new_coords) U, S, VT = np.linalg.svd(H) R = VT.T @ U.T # self._v.draw_edges(self.source.bonds, color="black", opacity=0.5) atom_coords = np.array([atom.coord for atom in self.source.get_atoms()]) atom_coords = (R @ (atom_coords - old_centroid).T).T + new_centroid for coord, atom in zip(atom_coords, self.source.get_atoms()): atom.set_coord(coord) # for atom in self.source.get_atoms(): # vec = atom.coord - old_centroid # new_coord = (R @ vec) + old_centroid + translation_vector # atom.set_coord(new_coord) # self._v.draw_edges(self.source.bonds, color="orange", opacity=0.5) # self._v.draw_point("anchor source (new)", self._anchors[1].coord, color="teal") def _find_removals(self, target_removals, source_removals): """ Find the atoms to remove from the target and source molecules while stitching them together """ if len(target_removals) == 0: _target_removals = [ next( i for i in self.target.get_neighbors(self._anchors[0]) if i.element == "H" ) ] else: _target_removals = [ self.target.get_atom(atom, residue=self._target_residue) for atom in target_removals ] if len(source_removals) == 0: _source_removals = [ next( i for i in self.source.get_neighbors(self._anchors[1]) if i.element == "H" ) ] else: _source_removals = [ self.source.get_atom(atom, residue=self._source_residue) for atom in source_removals ] return (set(_target_removals), set(_source_removals)) def _remove_atoms(self): """ Remove the atoms specified in the removals list """ # self.target._remove_atoms(*self._removals[0]) # self.source._remove_atoms(*self._removals[1]) # return mapping = { 0: self.target, 1: self.source, } for i, removals in enumerate(self._removals): obj = mapping[i] for atom in removals: bonds = ( i for i in obj._bonds if atom in i # if atom.full_id == i[0].full_id or atom.full_id == i[1].full_id ) for bond in bonds: # obj._bonds.remove(bond) obj._remove_bond(*bond) obj._AtomGraph.remove_node(atom) p = atom.get_parent() if p: p.detach_child(atom.get_id()) # p.remove(atom) # p.child_dict.pop(atom.id) atom.set_parent(p) adx = 1 for atom in obj._model.get_atoms(): atom.set_serial_number(adx) adx += 1 def _optimize(self, steps: int = 30, **kwargs): """ Optimize the geometry of the source molecule Parameters ---------- steps : int The number of optimization steps to perform """ self.target.adjust_indexing(self.source) bonds = self.target.get_bonds(self._target_residue) bonds.extend(self.source.get_bonds(self._source_residue)) # the double Molecule seems necessary when the file is not direclty run... tmp = Molecule.Molecule.empty("tmp") orig_coords_target = { atom: (atom.coord[0], atom.coord[1], atom.coord[2]) for atom in self._target_residue.child_list } orig_coords_source = { atom: (atom.coord[0], atom.coord[1], atom.coord[2]) for atom in self._source_residue.child_list } target_residue_parent = self._target_residue.get_parent() source_residue_parent = self._source_residue.get_parent() orig_parents_target = {} orig_parents_source = {} tmp.add_residues( self._target_residue, self._source_residue, adjust_seqid=False, _copy=False, ) # we could also add all close-by atoms bystanders = base_classes.Residue("bystanders", 3, "") include_bystanders = False tmp.add_residues(bystanders, _copy=False) r = self._optimize_bystander_radius for atom in self.target.get_atoms(): if structural_base.compute_distance(atom, self._anchors[0]) < r: if atom in tmp.get_atoms(): continue orig_coords_target[atom] = (atom.coord[0], atom.coord[1], atom.coord[2]) orig_parents_target[atom] = atom.parent bystanders.add(atom) tmp._AtomGraph.add_node(atom) include_bystanders = True for atom in self.source.get_atoms(): if structural_base.compute_distance(atom, self._anchors[1]) < r: if atom in tmp.get_atoms(): continue orig_coords_source[atom] = (atom.coord[0], atom.coord[1], atom.coord[2]) orig_parents_source[atom] = atom.parent bystanders.add(atom) tmp._AtomGraph.add_node(atom) include_bystanders = True tmp._add_bonds(*bonds) tmp._add_bond(self._anchors[0], self._anchors[1]) graph = tmp.make_residue_graph(False) graph.make_detailed(n_samples=0.5) if include_bystanders: graph.add_nodes_from(bystanders.get_atoms()) edges = graph.find_rotatable_edges() env = optimizers.DistanceRotatron( graph, edges, concatenation_function=optimizers.concatenation_function_linear, ) best, _ = optimizers.swarm_optimize( env, n_particles=kwargs.pop("n_particles", 5), max_steps=int(steps), **kwargs, ) self._policy = edges, best self._target_residue.parent = target_residue_parent self._source_residue.parent = source_residue_parent for atom in self._target_residue.child_list: atom.parent = self._target_residue orig_coords = orig_coords_target[atom] atom.coord[0] = orig_coords[0] atom.coord[1] = orig_coords[1] atom.coord[2] = orig_coords[2] for atom in self._source_residue.child_list: atom.parent = self._source_residue orig_coords = orig_coords_source[atom] atom.coord[0] = orig_coords[0] atom.coord[1] = orig_coords[1] atom.coord[2] = orig_coords[2] for atom in orig_parents_source: atom.parent = orig_parents_source[atom] orig_coords = orig_coords_source[atom] atom.coord[0] = orig_coords[0] atom.coord[1] = orig_coords[1] atom.coord[2] = orig_coords[2] for atom in orig_parents_target: atom.parent = orig_parents_target[atom] orig_coords = orig_coords_target[atom] atom.coord[0] = orig_coords[0] atom.coord[1] = orig_coords[1] atom.coord[2] = orig_coords[2] # =============================================================== # This works really well, but is slow because of the copying # =============================================================== # tmp.add_residues( # self._target_residue, # self._source_residue, # adjust_seqid=False, # _copy=True, # ) # # we could also add all close-by atoms # r = self._optimize_bystander_radius # for atom in self.target.get_atoms(): # if atom - self._anchors[0] < r or atom - self._anchors[1] < r: # if atom in tmp.get_atoms(): # continue # tmp.add_atoms(atom, _copy=True) # for atom in self.source.get_atoms(): # if atom - self._anchors[0] < r or atom - self._anchors[1] < r: # if atom in tmp.get_atoms(): # continue # tmp.add_atoms(atom, _copy=True) # for b in bonds: # b = (b[0].serial_number, b[1].serial_number) # tmp.add_bond(*b) # tmp.add_bond(self._anchors[0].serial_number, self._anchors[1].serial_number) # graph = tmp.make_residue_graph() # graph.make_detailed(include_outliers=True, include_heteroatoms=True, f=1.2) # edges = sorted(tmp.get_residue_connections()) # env = optimizers.MultiBondRotatron(graph, edges, mask_same_residues=False) # best, _ = optimizers.scipy_optimize(env, int(steps), **kwargs) # self._policy = edges, best # ================================================================================ # self.best.append(best) # f, reward, *_ = env.step(best) # self.rewards.append(reward) # self.env = env
[docs] def merge(self) -> "Molecule.Molecule": """ Merge the source molecule into the target molecule """ self.target.add_residues(*self.source.residues) self.target._bonds.extend(self.source.bonds) self.target._AtomGraph.migrate_bonds(self.source._AtomGraph) self.target.add_bond(self._anchors[0], self._anchors[1]) # self.target.reindex() # v = bb.utils.visual.MoleculeViewer3D(self.target) if self._policy[0] is not None: edges, angles = self._policy for bond, angle in zip(edges, angles): # we used to use full_id which worked overall good. I think we used to have # also a version with serial for a while. Let's do this again... # UPDATE: By now there should be no issues with the atom's being in there # so we don't actually need to use the full_id anymore _bond = bond # v.draw_vector( # "rotation", # bond[0].coord, # 1.1 * (bond[1].coord - bond[0].coord), # color="orange", # ) self.target.rotate_around_bond( *_bond, angle, descendants_only=True, angle_is_degrees=False ) self._policy = None, None # v.draw_edges(self.target.bonds, color="red") # v.show() return self.target
__default_keep_keep_stitcher__ = Stitcher(False, False) """ Default stitcher for the case that both molecules are kept """ # __default_keep_copy_stitcher__ = Stitcher(False, True) # """ # Default stitcher for the case that the target molecule is kept and the source molecule is copied # """ # __default_copy_copy_stitcher__ = Stitcher(True, True) # """ # Default stitcher for the case that both molecules are copied # """
[docs] def stitch( target: "Molecule.Molecule", source: "Molecule.Molecule", recipe: "Linkage.Linkage" = None, target_removals: tuple = None, source_removals: tuple = None, target_atom: Union[int, "base_classes.Atom"] = None, source_atom: Union[int, "base_classes.Atom"] = None, target_residue: Union[int, "base_classes.Residue"] = None, source_residue: Union[int, "base_classes.Residue"] = None, optimization_steps: int = 1e4, copy_target: bool = False, copy_source: bool = False, **kwargs, ) -> "Molecule.Molecule": """ Stitch two molecules together. Thereby the source molecule is integrated into the target molecule. Parameters ---------- target : Molecule The target molecule source : Molecule The source molecule. This will be attached to the target molecule. recipe : AbstractRecipe The recipe to be used for stitching. If none is provided, the stitching instructions can be submitted manually via the other parameters. target_removals : tuple This parameter is only used if no recipe is provided. A tuple of atoms to be removed from the target molecule. These must be the atom's ids within the attaching residue. All atoms must be part fo the same residue as the attaching `target_atom`. source_removals : tuple This parameter is only used if no recipe is provided. A tuple of atoms to be removed from the source molecule. These must be the atom's ids within the attaching residue. All atoms must be part fo the same residue as the attaching `source_atom`. target_atom : int or Atom This parameter is only used if no recipe is provided. The atom on the target molecule to which the source molecule will be attached. This may either be the atom object directly or its serial number. If none is provided, the molecule's "root atom" is used (if defined). source_atom : int or Atom This parameter is only used if no recipe is provided. The atom on the source molecule to which the target molecule will be attached. This may either be the atom object directly or its serial number. If none is provided, the molecule's "root atom" is used (if defined). target_residue : int or Residue The residue hosting the target atom. This is only required if the target atom is not given directly or specified by name. If a recipe is provided, the "attach_residue" is used by default. A ValueError is raised if none is defined. source_residue : int or Residue The residue hosting the source atom. This is only required if the source atom is not given directly or specified by name. If a recipe is provided, the "attach_residue" is used by default. A ValueError is raised if none is defined. optimization_steps : int, optional The number of steps to take in the optimization process, by default 1e4 copy_target : bool, optional Whether to copy the target molecule before stitching, by default False copy_source : bool, optional Whether to copy the source molecule before stitching, by default False **kwargs Additional keyword arguments to pass to the optimizer. See the documentation for `biobuild.optimizers.agents.optimize` for more details. Returns ------- Molecule The stitched molecule """ if recipe is not None: if target_residue is None: if target.attach_residue is None: target_residue = target.residues[-1] # raise ValueError( # "A residue in the target molecule must be provided or the target molecule must define an attachment residue" # ) else: target_residue = target.attach_residue if source_residue is None: if source.attach_residue is None: source_residue = source.residues[-1] # raise ValueError( # "A residue in the source molecule must be provided or the source molecule must define an attachment residue" # ) else: source_residue = source.attach_residue target_atom, source_atom = recipe._stitch_ref_atoms return stitch( target=target, source=source, target_removals=recipe.deletes[0], source_removals=recipe.deletes[1], target_atom=target_atom, source_atom=source_atom, target_residue=target_residue, source_residue=source_residue, optimization_steps=optimization_steps, **kwargs, ) # ===================== if copy_source: source = source.copy() if copy_target: target = target.copy() __default_keep_keep_stitcher__.apply( target=target, source=source, target_removals=target_removals, source_removals=source_removals, target_atom=target_atom, source_atom=source_atom, target_residue=target_residue, source_residue=source_residue, optimization_steps=optimization_steps, **kwargs, ) return __default_keep_keep_stitcher__.merge()
if __name__ == "__main__": import biobuild as bb bb.load_sugars() glc = bb.Molecule.from_compound("GLC") s = Stitcher(False, False) s.apply( glc, glc.copy(), # ["O1", "HO1"], # [ # "HO4", # ], target_atom="C1", source_atom="O4", ) s.merge().show() # ser = bb.Molecule.from_pubchem("SER") # ser.autolabel() # mol3 = bb.Molecule.from_pubchem("tert-butyl acetate") # # mol3.autolabel() # # attach the red bit to the serine # l1 = bb.linkage("C6", "O3", ["H12"], ["HO3"]) # out = stitch(mol3, ser, l1) # out.show() # import biobuild as bb # glc = bb.Molecule.from_compound("GLC") # glc.repeat(4, "14bb") # glc2 = bb.Molecule.from_compound("GLC") # prot = bb.core.Scaffold.from_pdb( # "/Users/noahhk/GIT/biobuild/support/examples/4tvp.prot.pdb" # ) # prot.infer_bonds(restrict_residues=False) # s = Stitcher(True, True) # res = prot.find()[0] # s.apply(prot, glc, target_removals=("")) # # --------------------------------------------- # # This is a very nice code piece down here... # # --------------------------------------------- # # glc2.rotate_around_bond(6, 5, 68) # # glc2.rotate_around_bond(3, 4, 41) # # v = None # # n = 200 # # with alive_progress.alive_bar(n) as bar: # # for i in range(n): # # s = Stitcher(True, True) # # s.apply(glc, glc2, ("O1", "HO1"), ("HO4",), "C1", "O4", 1e4) # # new_glc = s.merge() # # if v is None: # # v = bb.utils.visual.MoleculeViewer3D(new_glc) # # else: # # v.draw_edges(new_glc.bonds, color="red", opacity=0.02) # # bar() # # if v: v.show() # # import pandas as pd # # import seaborn as sns # # import matplotlib.pyplot as plt # # df = pd.DataFrame(s.best, columns=["phi", "psi"]) # # df["reward"] = s.rewards # # ax = sns.jointplot(data=df, x="phi", y="psi", kind="kde") # # sns.scatterplot(data=df, x="phi", y="psi", hue="reward", ax=ax.ax_joint) # # plt.show() # # pass