"""
The `Assembler` class can be used to assemble molecules from a library of fragments.
It requires a list of Molecules that serve as the fragments to be assembled. The class will then
generate random molecules by randomly selecting fragments from the library and attaching them to each other.
Usage
-----
1. Create a list of fragments to be used for assembly
2. Create an instance of the Assembler class with the list of fragments
3. Use the `sample` method to generate random molecules or the `make` method to create a specific molecule from an instruction matrix
Example
-------
Let's make a little toy example
.. code-block:: python
import buildamol as bam
from buildamol.extensions.molecular_factories import Assembler
import matplotlib.pyplot as plt
# get some molecules to serve as fragments
fragments = [
bam.Molecule.from_smiles("C1=CC=CC=C1", id="A").autolabel(),
bam.Molecule.from_smiles("CC=O", id="B").autolabel(),
bam.Molecule.from_smiles("COC=C", id="C").autolabel(),
bam.Molecule.from_smiles("C1=CCC=C1", id="D").autolabel(),
bam.Molecule.from_smiles("C(C)N", id="E").autolabel(),
]
# make the assembler
assembler = Assembler(fragments)
# generate some molecules from 3 fragments each
# let's make 9 molecules
molecules = assembler.sample(n_fragments=3, n=9)
fig, axs = plt.subplots(3, 3, figsize=(12, 12))
for mol, ax in zip(molecules, axs.flat):
ax.imshow(
mol.draw2d().draw(),
)
ax.axis("off")
plt.show()
.. image:: examples/files/assembler_example1.png
Making Molecules from Arrays
----------------------------
We can also use the `make` method to create a specific molecule from an instruction matrix
This matrix is a 2D numpy array where each row corresponds to an instruction for attaching the next
fragment onto the molecule. The columns are as follows:
.. code-block::
[
[incoming_fragment_global_index, incoming_atom_index, target_fragment_atom],
[incoming_fragment_global_index, incoming_atom_index, target_fragment_atom]
...
]
The `incoming_fragment_global_index` is the index of the fragment in the fragment library (i.e. in the list).
The `incoming_atom_index` is the index of the atom in the incoming fragment that will be attached to the target fragment (i.e. the attachment point).
The `target_fragment_atom` is the index of the atom in the target fragment that will be attached to the incoming fragment.
Let's make a molecule from an instruction matrix. We take the fourth fragment molecule as a start. Then we attach the second fragment molecule to it by attaching its second atom to the first atom of the already present molecule.
Then attach again the fourth fragment onto the molecule by attaching its first atom to the first atom of the second fragment in the molecule.
.. code-block:: python
matrix = np.array([
[3, 0, 0],
[1, 1, 0],
[3, 0, 0],
])
mol = assembler.make(matrix)
mol.draw2d().show()
.. image:: examples/files/assembler_example2.jpg
If including this into an automatic pipeline or an optimization loop it is recommended to wrap the whole thing into a try-except block to catch any errors that might occur due to invalid matrices.
The key point is that an atom used for attachment must not be used more than once in the matrix. If an atom is used more than once, the molecule cannot be assembled and an error is raised.
"""
import numpy as np
from buildamol.core import linkage, Molecule
[docs]
class Assembler:
    """
    Assemble molecules from a library of fragments.

    Each molecule is a linear chain of fragments: every fragment after the
    first one is attached to the fragment that was added immediately before it.

    Parameters
    ----------
    fragments : list
        A list of Molecules that serve as the fragments to be assembled.
        Fragments without any attachment point (a non-hydrogen atom that has
        a hydrogen neighbor) are left out of the library. The list passed in
        is not modified.

    Attributes
    ----------
    fragments : list
        The fragments that were kept, in their original relative order.
    attachment_points : list
        For each kept fragment, the indices (positions in
        `fragment.get_atoms()`) of the atoms another fragment may attach to.
    atom_ids : list
        For each kept fragment, the ids of all its atoms in
        `fragment.get_atoms()` order; used to build linkages.
    """

    def __init__(self, fragments: list):
        # The three lists below are built in lockstep so that index k always
        # refers to the same fragment in all of them. Building a fresh list
        # (instead of deleting from the input while holding stale indices)
        # keeps them aligned and leaves the caller's list untouched.
        kept = []
        attachment_points = []
        atom_ids = []

        for fragment in fragments:
            atoms = list(fragment.get_atoms())

            # every non-hydrogen atom that still carries a hydrogen
            # is a potential attachment point
            points = [
                adx
                for adx, atom in enumerate(atoms)
                if atom.element != "H" and fragment.get_hydrogen(atom)
            ]

            # fragments without attachment points can never be linked
            if not points:
                continue

            kept.append(fragment)
            attachment_points.append(points)
            atom_ids.append([atom.id for atom in atoms])

        self.fragments = kept
        self.attachment_points = attachment_points
        self.atom_ids = atom_ids

    def specify_attachment_points(self, fragment_or_index, points: list):
        """
        Specify the attachment points for a fragment

        Parameters
        ----------
        fragment_or_index : int or Molecule
            The fragment for which to specify the attachment points, either
            as the object itself or as its index in `self.fragments`
            (Python and numpy integers are both accepted).
        points : list
            The attachment points to specify. These must be the indices of the atoms in the fragment as they appear in `fragment.get_atoms()` (NOT the `serial_number`!).

        Raises
        ------
        ValueError
            If a fragment object is given that is not part of the library.
        IndexError
            If an index is given that is out of range.
        """
        if isinstance(fragment_or_index, (int, np.integer)):
            idx = int(fragment_or_index)
        else:
            idx = self.fragments.index(fragment_or_index)
        # store a copy so later changes to the caller's list do not leak in
        self.attachment_points[idx] = list(points)

    def sample(self, n_fragments: int, n: int = 1):
        """
        Generate n random molecules from the fragment library

        This is a generator: molecules are only built while iterating, so
        any error from `random` or `make` surfaces during iteration.

        Parameters
        ----------
        n_fragments : int
            The number of fragments to use for each molecule
        n : int
            The number of molecules to generate

        Yields
        ------
        Molecule
            A molecule assembled from the fragments
        """
        for _ in range(n):
            matrix = self.random(n_fragments)
            yield self.make(matrix)

    def make(self, matrix: np.ndarray) -> "Molecule":
        """
        Assemble a molecule based on an instruction matrix

        Row 0 only selects the starting fragment (its atom columns are
        ignored). Every following row `i` is
        `[fragment_index, incoming_atom_index, target_atom_index]` and
        attaches a new fragment to the fragment added by row `i - 1`.

        Parameters
        ----------
        matrix : np.ndarray
            The matrix encoding for the molecule, of shape (n_fragments, 3).

        Returns
        -------
        Molecule
            The assembled molecule

        Raises
        ------
        ValueError
            If the matrix is empty or not of shape (n, 3), or if a row
            targets the atom through which the previous fragment was itself
            attached.
        """
        matrix = np.asarray(matrix)
        if matrix.ndim != 2 or matrix.shape[0] == 0 or matrix.shape[1] != 3:
            raise ValueError(
                "matrix must be a non-empty 2D array with 3 columns, "
                f"got shape {matrix.shape}"
            )

        # we start by copying the first fragment
        mol = self.fragments[matrix[0, 0]].copy()

        # The only atom of the target fragment that can already be occupied
        # is the one it used to connect to its own predecessor.
        previous_source_atom = None

        # we then attach all other fragments
        for i in range(1, len(matrix)):
            source, source_atom, target_atom = matrix[i, :]
            target = i - 1

            # sanity checking to ensure we are not trying to attach to the same atom twice
            if previous_source_atom is not None and target_atom == previous_source_atom:
                raise ValueError("Target atom already used")

            # make a linkage and attach the fragment
            link = linkage(
                self.atom_ids[matrix[target, 0]][target_atom],
                self.atom_ids[source][source_atom],
            )
            # NOTE(review): assumes each attached fragment becomes one residue
            # numbered from 1 in attachment order - confirm against Molecule.attach
            mol.attach(self.fragments[source], link, at_residue=int(target + 1))
            previous_source_atom = source_atom

        return mol

    def random(self, n_fragments: int) -> np.ndarray:
        """
        Make a random matrix encoding for a molecule assembled from fragments.

        The matrix is built so that no attachment point is used twice. To
        guarantee this, fragments placed in the interior of the chain are
        drawn only from fragments with at least two distinct attachment
        points (one to connect to the predecessor, one for the successor).

        Parameters
        ----------
        n_fragments : int
            The number of fragments to use for the molecule

        Returns
        -------
        np.ndarray
            A matrix encoding for the molecule, of shape (n_fragments, 3).
            The atom columns of the first row are left at -1.

        Raises
        ------
        ValueError
            If `n_fragments` is smaller than 1, or if the library holds no
            fragment that can fill one of the chain positions.
        """
        if n_fragments < 1:
            raise ValueError(f"n_fragments must be at least 1, got {n_fragments}")

        # classify the fragments by how many distinct sites they offer
        n_sites = [len(set(points)) for points in self.attachment_points]
        everything = list(range(len(self.fragments)))
        end_capable = [idx for idx, count in enumerate(n_sites) if count >= 1]
        bridge_capable = [idx for idx, count in enumerate(n_sites) if count >= 2]

        matrix = np.full((n_fragments, 3), -1, dtype=int)
        last = n_fragments - 1

        # the atom through which the previous fragment was attached
        # (and which is therefore not available as a target anymore)
        previous_incoming = None

        for i in range(n_fragments):
            # choose a fragment that can serve at this chain position
            if n_fragments == 1:
                candidates = everything
            elif 0 < i < last:
                candidates = bridge_capable
            else:
                candidates = end_capable
            if not candidates:
                raise ValueError(
                    f"No fragment in the library can be placed at position {i} "
                    f"of a chain of {n_fragments} fragments"
                )
            matrix[i, 0] = np.random.choice(candidates)

            # the first fragment is not attached to anything
            if i == 0:
                continue

            # choose an attachment point on the incoming fragment
            # (nothing on a freshly chosen fragment is occupied yet)
            matrix[i, 1] = np.random.choice(self.attachment_points[matrix[i, 0]])

            # choose an attachment point on the previous fragment
            # that was not used already
            available = [
                point
                for point in self.attachment_points[matrix[i - 1, 0]]
                if previous_incoming is None or point != previous_incoming
            ]
            if not available:
                # cannot happen with the candidate filtering above unless the
                # attachment points were changed concurrently; fail loudly
                # rather than looping forever
                raise ValueError(
                    f"No free attachment point left on the fragment at position {i - 1}"
                )
            matrix[i, 2] = np.random.choice(available)
            previous_incoming = matrix[i, 1]

        return matrix
if __name__ == "__main__":
    # Small visual demo: build one molecule from an explicit instruction
    # matrix, then draw a 3x3 grid of randomly sampled molecules.
    import buildamol as bam
    import matplotlib.pyplot as plt

    # fragment id -> SMILES of the fragment
    smiles_by_id = {
        "A": "C1=CC=CC=C1",
        "B": "CC=O",
        "C": "COC=C",
        "D": "C1=CCC=C1",
        "E": "C(C)N",
    }
    fragments = [
        bam.Molecule.from_smiles(smiles, id=fragment_id).autolabel()
        for fragment_id, smiles in smiles_by_id.items()
    ]
    assembler = Assembler(fragments)

    # start with the fourth fragment, attach the second one, then the fourth again
    instructions = np.array([[3, 0, 0], [1, 1, 0], [3, 0, 0]])
    assembler.make(instructions).draw2d().show()

    # nine random molecules of three fragments each, one per subplot
    figure, axes = plt.subplots(3, 3, figsize=(12, 12))
    for sampled, panel in zip(assembler.sample(3, 9), axes.flat):
        picture = sampled.draw2d().draw()
        panel.imshow(picture)
        panel.axis("off")
    plt.show()