Source code for buildamol.extensions.bio.proteins.peptides

import buildamol.core as core
import buildamol.resources as resources
import buildamol.structural as structural
import numpy as np
from typing import Union

# Public names of this module: the peptide builder, the backbone dihedral
# helpers, the sequence utilities, and the amino acid accessor / name sets.
__all__ = [
    "peptide",
    "phi",
    "psi",
    "omega",
    "sequence",
    "sequence_to_3letter",
    "sequence_to_1letter",
    "amino_acids",
    "amino_acid_names_3letter",
    "amino_acid_names_1letter",
]

# Lookup table: 1-letter code -> 3-letter code of the 20 standard amino acids,
# listed alphabetically by 1-letter code.
_1to3 = dict(
    A="ALA",
    C="CYS",
    D="ASP",
    E="GLU",
    F="PHE",
    G="GLY",
    H="HIS",
    I="ILE",
    K="LYS",
    L="LEU",
    M="MET",
    N="ASN",
    P="PRO",
    Q="GLN",
    R="ARG",
    S="SER",
    T="THR",
    V="VAL",
    W="TRP",
    Y="TYR",
)

# Reverse lookup table: 3-letter code -> 1-letter code.
_3to1 = dict(zip(_1to3.values(), _1to3.keys()))

# aliases kept for compatibility with older versions
__1to3 = _1to3
__3to1 = _3to1

amino_acid_names_3letter = set(_3to1)
"""
The 3-letter codes of standard amino acids.
This includes only the names of the standard amino acids; does not load any molecule mobjects. 
Use the `amino_acids` object to access the actual molecules.
"""

# iterating the 1-letter -> 3-letter table yields its keys, i.e. the 1-letter codes
amino_acid_names_1letter = set(_1to3)
"""
The 1-letter codes of standard amino acids.
This includes only the names of the standard amino acids; does not load any molecule mobjects.
Use the `amino_acids` object to access the actual molecules.
"""


class _amino_acids_generator:
    """
    The standard amino acids obtainable via attribute access.

    Attribute names of length one are treated as 1-letter codes and names of
    length three as 3-letter codes (both case-insensitive). Any other name is
    passed unchanged to `resources.get_compound`.
    """

    def __getattr__(self, name: str) -> core.Molecule:
        """
        Resolve an attribute name to an amino acid molecule.

        Parameters
        ----------
        name : str
            The attribute that was requested. A single character is looked up
            as a 1-letter code, three characters as a 3-letter code; names of
            any other length are handed to `resources.get_compound` as they are.

        Returns
        -------
        Molecule
            The object returned by `resources.get_compound` for the resolved name.

        Raises
        ------
        ValueError
            If the 1-letter or 3-letter code is not a standard amino acid,
            or if `resources.get_compound` returns None for the name.
        """
        if len(name) == 1:
            # keep the requested letter in its own variable so that the error
            # message reports it instead of the failed lookup result (None)
            three_letter = _1to3.get(name.upper(), None)
            if three_letter is None:
                raise ValueError(f"Unknown amino acid: '{name}'")
            name = three_letter
        elif len(name) == 3:
            name = name.upper()
            if name not in amino_acid_names_3letter:
                raise ValueError(f"Unknown amino acid: '{name}'")
        # the amino acids are (re)loaded on every access before the lookup
        resources.load_amino_acids()
        mol = resources.get_compound(name)
        if mol is None:
            raise ValueError(f"Unknown amino acid: '{name}'")
        return mol

    def __dir__(self):
        """List the 3-letter codes so that they show up in tab completion."""
        return sorted(amino_acid_names_3letter)

    def __repr__(self):
        """Show the accessor together with the sorted 3-letter codes."""
        return f"amino_acids({', '.join(sorted(amino_acid_names_3letter))})"


amino_acids = _amino_acids_generator()
"""
Access point for standard amino acids. This supports 1-letter, 3-letter codes, and full names.

Example
-------
>>> from buildamol.extensions.bio.proteins.peptides import amino_acids
>>> amino_acids.ALA  # Access by 3-letter code
Molecule(ALA)  
>>> amino_acids.D  # Access by 1-letter code
Molecule(ASP)
>>> amino_acids.proline  # Access by full name
Molecule(PRO)

Each amino acid is a new and unique `Molecule` object!
>>> amino_acids.arginine == amino_acids.arginine  # each access returns a new Molecule object
False
"""


def peptide(seq: str) -> core.Molecule:
    """
    Create a peptide from a sequence

    Parameters
    ----------
    seq : str
        The sequence of the peptide in one-letter code.
        Must contain at least one residue.

    Returns
    -------
    Molecule
        The peptide

    Raises
    ------
    ValueError
        If the sequence is empty, or if it contains a letter that is not a
        standard amino acid (or for which no compound could be obtained).
    """
    if not seq:
        # without this guard an empty sequence fails below with a bare
        # IndexError on seq[0]
        raise ValueError("Cannot build a peptide from an empty sequence")

    resources.load_amino_acids()

    # one building block per distinct letter; named `residues` so that the
    # module-level `amino_acids` accessor is not shadowed
    residues = {}
    for aa in seq:
        if aa in residues:
            continue
        compound = resources.get_compound(_1to3[aa]) if aa in _1to3 else None
        if compound is None:
            raise ValueError(f"Unknown amino acid: '{aa}'")
        residues[aa] = compound

    # start from a copy of the first residue and attach the others in order
    mol: core.Molecule = residues[seq[0]].copy()
    mol.set_linkage("LINK")
    for aa in seq[1:]:
        mol.attach(residues[aa], use_patch=False)

    # only optimize if the freshly built chain reports clashes
    if mol.count_clashes():
        mol.optimize()
    return mol
def sequence(mol: core.Molecule, unknown: Union[str, callable] = "X") -> str:
    """
    Get the 1-letter code sequence of a peptide.
    This also works for proteins with multiple chains.
    Chains are separated by a colon.

    Parameters
    ----------
    mol : Molecule
        The peptide
    unknown : str or callable, optional
        The character to use for unknown residues (default: 'X').
        This can also be set to a function that takes the molecule and the
        residue object and returns a string.
        Set to None to ignore unknown residues.

    Returns
    -------
    str
        The sequence of the peptide in one-letter code

    Raises
    ------
    TypeError
        If `mol` is not a Molecule, or if `unknown` is neither a string,
        a callable, nor None.
    """
    if not isinstance(mol, core.Molecule):
        raise TypeError("Expected a Molecule object")

    # normalize `unknown` into a function (mol, res) -> str; the branches are
    # ordered so that an unsupported type actually reaches the TypeError
    if unknown is None:
        _unknown = lambda mol, res: ""
    elif callable(unknown):
        _unknown = unknown
    elif isinstance(unknown, str):
        _unknown = lambda mol, res: unknown
    else:
        raise TypeError(f"Unknown must be a string or a callable, got {type(unknown)}")

    chain_sequences = []
    for chain in mol.get_chains():
        letters = []
        for res in chain.child_list:
            letter = _3to1.get(res.name, None)
            if letter is None:
                # not a standard amino acid: let the fallback decide
                letter = _unknown(mol, res)
            letters.append(letter)
        chain_sequences.append("".join(letters))
    return ":".join(chain_sequences)
def sequence_to_3letter(seq: str, sep=" ") -> str:
    """
    Convert a sequence in one-letter code to three-letter code.

    Characters that are not a known one-letter code are kept unchanged.

    Parameters
    ----------
    seq : str
        The sequence in one-letter code.
    sep : str, optional
        The separator to use between the three-letter codes (default: space)

    Returns
    -------
    str
        The sequence in three-letter code.

    Example
    -------
    >>> sequence_to_3letter("ACDE")
    'ALA CYS ASP GLU'
    """
    # fall back to the character itself when it has no three-letter code
    three_letter_codes = [_1to3.get(letter, letter) for letter in seq]
    return sep.join(three_letter_codes)
def sequence_to_1letter(seq: str, sep=" ") -> str:
    """
    Convert a sequence in three-letter code to one-letter code.

    Entries that are not a known three-letter code are kept unchanged.

    Parameters
    ----------
    seq : str
        The sequence in three-letter code.
    sep : str, optional
        The separator that is used between the three-letter codes in `seq` (default: space)

    Returns
    -------
    str
        The sequence in one-letter code.

    Example
    -------
    >>> sequence_to_1letter("ALA CYS ASP GLU")
    'ACDE'
    """
    one_letter_codes = []
    for code in seq.split(sep):
        # fall back to the entry itself when it has no one-letter code
        one_letter_codes.append(_3to1.get(code, code))
    return "".join(one_letter_codes)
def phi(
    mol: core.Molecule, res: Union[int, core.Residue] = None
) -> Union[float, np.ndarray]:
    """
    Compute the phi angle of a residue in a protein

    Parameters
    ----------
    mol : Molecule
        The protein
    res : int or Residue
        The residue (or its number) having the alpha carbon.
        If not provided, all residues are considered.

    Returns
    -------
    float or ndarray
        The phi angle(s) in degrees. NaN is returned for a residue
        that has no preceding residue.

    Raises
    ------
    ValueError
        If the requested residue is not found in the molecule.
    """
    if res is None:
        # NOTE(review): assumes residues are numbered 1..count_residues() -- confirm
        numbers = range(1, mol.count_residues() + 1)
        return np.array([phi(mol, r) for r in numbers])

    # keep the query separate from the lookup result so that the error
    # message reports what was asked for instead of "None"
    residue = mol.get_residue(res)
    if residue is None:
        raise ValueError(f"Residue {res} not found")

    _prev = mol.get_residue(residue.serial_number - 1)
    if _prev is None:
        # phi needs the carbonyl carbon of the previous residue
        return np.nan

    # get the atoms
    N = residue.get_atom("N")
    CA = residue.get_atom("CA")
    C = residue.get_atom("C")
    C_prev = _prev.get_atom("C")
    return structural.compute_dihedral(C_prev, N, CA, C)
def psi(
    mol: core.Molecule, res: Union[int, core.Residue] = None
) -> Union[float, np.ndarray]:
    """
    Compute the psi angle of a residue in a protein

    Parameters
    ----------
    mol : Molecule
        The protein
    res : int or Residue
        The residue (or its number) having the alpha carbon.
        If not provided, all residues are considered.

    Returns
    -------
    float or ndarray
        The psi angle(s) in degrees. NaN is returned for a residue
        that has no following residue.

    Raises
    ------
    ValueError
        If the requested residue is not found in the molecule.
    """
    if res is None:
        # NOTE(review): assumes residues are numbered 1..count_residues() -- confirm
        numbers = range(1, mol.count_residues() + 1)
        return np.array([psi(mol, r) for r in numbers])

    # keep the query separate from the lookup result so that the error
    # message reports what was asked for instead of "None"
    residue = mol.get_residue(res)
    if residue is None:
        raise ValueError(f"Residue {res} not found")

    _next = mol.get_residue(residue.serial_number + 1)
    if _next is None:
        # psi needs the amide nitrogen of the next residue
        return np.nan

    # get the atoms
    N = residue.get_atom("N")
    CA = residue.get_atom("CA")
    C = residue.get_atom("C")
    N_next = _next.get_atom("N")
    return structural.compute_dihedral(N, CA, C, N_next)
def omega(
    mol: core.Molecule, res: Union[int, core.Residue] = None
) -> Union[float, np.ndarray]:
    """
    Compute the omega angle of a residue in a protein

    Parameters
    ----------
    mol : Molecule
        The protein
    res : int or Residue
        The residue (or its number) having the carboxyl carbon.
        If not provided, all residues are considered.

    Returns
    -------
    float or ndarray
        The omega angle(s) in degrees. NaN is returned for a residue
        that has no following residue.

    Raises
    ------
    ValueError
        If the requested residue is not found in the molecule.
    """
    if res is None:
        # NOTE(review): assumes residues are numbered 1..count_residues() -- confirm
        numbers = range(1, mol.count_residues() + 1)
        return np.array([omega(mol, r) for r in numbers])

    # keep the query separate from the lookup result so that the error
    # message reports what was asked for instead of "None"
    residue = mol.get_residue(res)
    if residue is None:
        raise ValueError(f"Residue {res} not found")

    _next = mol.get_residue(residue.serial_number + 1)
    if _next is None:
        # omega spans the bond to the next residue
        return np.nan

    # get the atoms
    CA = residue.get_atom("CA")
    C = residue.get_atom("C")
    N_next = _next.get_atom("N")
    CA_next = _next.get_atom("CA")
    return structural.compute_dihedral(CA, C, N_next, CA_next)
if __name__ == "__main__":
    # Manual smoke test: build a peptide that contains each standard amino
    # acid once, then print its sequence and a single amino acid.
    demo_peptide = peptide("ACDEFGHIKLMNPQRSTVWY")
    # demo_peptide.show()
    demo_sequence = sequence(demo_peptide)
    print(demo_sequence)
    print(amino_acids.ALA)