Source code for biobuild.resources.pubchem

"""
This module contains functions for interacting with PubChem.

Remotely accessing PubChem
--------------------------

The `query` function can be used to query PubChem for a compound. It returns the 2D and 3D
representations of the compound as `pubchempy.Compound` objects. The `pubchempy` package is
used to perform the query. The `query` function takes a query string and a query type as
input. The query type can be one of "name", "cid", "smiles", "inchi", "inchikey".
The returned outputs can be used to create ``Molecule`` objects using a function ``biobuild.Molecule._molecule_from_pubchem`` which is by default integrated
into the ``biobuild.Molecule.Molecule.from_pubchem`` class method.

Converting PubChem data to CIF
------------------------------
PubChem allows data downloads in the form of JSON and SDF files. The JSON file contains the descriptive data of a chemical compound while the SDF contains
the 3D conformer of the compound. The `pubchem_to_cif` function can be used to convert these files into a CIF file which can be used to load the compound
into biobuild using the ``PDBECompounds`` class. The function takes the paths to the JSON and SDF files as input and optionally an ID for the compound
and a path to the CIF file to write. If no ID is specified, the function will try to infer an ID from the JSON file.

.. code-block:: python

    from biobuild.resources import pubchem

    # ... download JSON and SDF files from PubChem for some compound
    json_file = "some_compound.json"
    sdf_file = "some_compound.sdf"

    # convert to CIF
    pubchem.pubchem_to_cif(json_file, sdf_file, id="SOMECOMPOUND", cif_file="some_compound.cif")
"""

import json
import os
import re
import pubchempy as pcp
from tabulate import tabulate

_grammar_dict = {
    1: "st",
    2: "nd",
    3: "rd",
}
"""
A mapping for the first three numbers to their ordinal suffixes.
"""

_sdf_coord_pattern = re.compile(
    r"^ {3,}(-?\d+\.\d*) {3,}(-?\d+\.\d*) {3,}(-?\d+\.\d*) +(\w+) +(\d)"
)
"""
Regex pattern for extracting atom coordinates from SDF file.
The pattern has 5 groups:
    1. x coordinate
    2. y coordinate
    3. z coordinate
    4. atom symbol
    5. charge
"""

_sdf_bond_pattern = re.compile(r"^ +(\d+) +(\d+) +(\d) {2}\d")
"""
Regex pattern for extracting bond information from SDF file.
The pattern has 3 groups:
    1. atom 1
    2. atom 2
    3. bond type
"""

_sdf_bond_type_map = {
    "1": "SING",
    "2": "DOUB",
    "3": "TRIP",
    "4": "DOUB",
}
"""
Mapping of SDF bond types to biobuild bond types.
"""

_sdf_charge_map = {
    "0": 0,
    "3": 1,
    "2": 2,
    "1": 3,
    "5": -1,
    "6": -2,
    "7": -3,
    "4": 0,  # would be radical, but we don't support that
}
"""
Mapping of SDF charge types to proper values.
"""

_cif_template_file = os.path.join(os.path.dirname(__file__), "pdbe_template.cif")
"""
Path to the CIF template file.
"""

_json_headers = ("TOCHeading", "Name", "String")
"""
Id providing headers in the JSON data structure
"""

# =========================================================================== #
#                               Public API                                   #
# =========================================================================== #



[docs]
def query(_query: str, by: str = "name", idx: int = 0):
    """
    Query PubChem for a compound.

    Parameters
    ----------
    _query : str
        The query string
    by : str, optional
        The type of query to perform. One of "name", "cid", "smiles", "inchi", "inchikey".
        Defaults to "name".
    idx : int, optional
        The index of the compound to return if multiple compounds are found. Defaults to 0.
        Only one compound will be returned at a time.

    Returns
    -------
    tuple
        The 2D and 3D representations of the compound
    """
    _2d_comp = pcp.get_compounds(_query, by)
    _3d_comp = pcp.get_compounds(_query, by, record_type="3d")
    if len(_2d_comp) == 0 or len(_3d_comp) == 0:
        raise ValueError(f"Could not find molecule {_query} in PubChem")
    elif len(_2d_comp) > 1 or len(_3d_comp) > 1:
        Warning(
            f"Found multiple molecules for {_query} in PubChem, using the {idx+1}{_grammar_dict.get(idx+1, 'th')} one!"
        )
    return _2d_comp[idx], _3d_comp[idx]




[docs]
def pubchem_to_cif(json_file: str, sdf_file: str, id: str = None, cif_file: str = None):
    """
    Convert a downloaded PubChem data entry to a CIF file. This is useful for preparing
    a PubChem component for use in biobuild - i.e. to relabel atoms as required by the
    CHARMM force field. Simply download the PubChem data for a molecule in form of the JSON
    data and the 3D conformer SDF file and pass the file names to this function to generate
    a CIF file from them which can be loaded into biobuild using the `PDBECompounds` class.

    Parameters
    ----------
    json_file : str
        The path to the JSON file containing the PubChem data of the molecule.
    sdf_file : str
        The path to the SDF file containing the 3D conformer of the molecule.
    id : str, optional
        The ID of the molecule. If not specified, an id will be inferred from the JSON file.
    cif_file : str, optional
        The path to the CIF file to write. If not specified, the CIF file will be written to
        the same directory as the JSON file with the same name (but adjusted suffix).
    """
    atoms, bonds = load_sdf(sdf_file)
    name, data = load_json(json_file)
    if id is None:
        id = make_id(name, data)

    # make the atom and bond tables
    atoms, atom_table = make_atom_table(id, atoms)
    _, bond_table = make_bond_table(id, bonds, atoms)

    # now make the CIF file
    cif = make_cif(
        id=id,
        name=name,
        type="Ligand",
        atoms=atom_table,
        bonds=bond_table,
        create_date=get_create_date(data),
        modify_date=get_modify_date(data),
        formal_charge=get_formal_charge(data),
        mol_weight=get_mol_weight(data),
        formula=get_formula(data),
        synonyms=get_synonyms(id, name, data)[1],
        descriptors=get_descriptors(id, data)[1],
    )

    # now write the CIF file
    if cif_file is None:
        # just in case the json file has a different suffix
        cif_file = json_file.replace(".json", "") + ".cif"
    with open(cif_file, "w") as f:
        f.write(cif)



__all__ = ["query", "pubchem_to_cif"]

# =========================================================================== #
#                               Private API                                  #
# =========================================================================== #


def load_json(json_file):
    """
    Load a PubChem JSON file and return the data as a dictionary.

    Parameters
    ----------
    json_file : str
        The path to the JSON file.

    Returns
    -------
    tuple
        The name of the molecule and the data as a dictionary.
    """
    with open(json_file, "r") as f:
        data = json.load(f)["Record"]
        name = data["RecordTitle"]
        data = data["Section"]

    # now remove the list structures
    data = {i[_json_headers[0]]: i for i in data}
    data = list_to_dict(data)

    return name, data


def load_sdf(sdf_file):
    """
    Load the SDF file and return the data a list of atom data and bond data.

    Parameters
    ----------
    sdf_file : str
        The path to the SDF file.

    Returns
    -------
    tuple
        The atom data and bond data.
    """
    atom_data = []
    bond_data = []
    with open(sdf_file, "r") as f:
        for line in f:
            if _sdf_coord_pattern.match(line):
                m = _sdf_coord_pattern.match(line)
                atom_data.append(m.groups())
            elif _sdf_bond_pattern.match(line):
                m = _sdf_bond_pattern.match(line)
                bond_data.append(m.groups())

    # Convert the data to a better format
    atom_data = [
        (
            idx + 1,  # atom index
            i[-2],  # atom element
            _sdf_charge_map[i[-1]],  # charge
            i[0],
            i[1],
            i[2],  # coordinatex xyz
        )
        for idx, i in enumerate(atom_data)
    ]
    bond_data = [(int(i[0]), int(i[1]), _sdf_bond_type_map[i[2]]) for i in bond_data]
    return atom_data, bond_data


def list_to_dict(_dict):
    """
    Flatten list-substructures into dictionaries.

    For some reason the PubChem JSON data structure contains lists of dictionaries
    where pure dictionaries would be sufficient. This function removes the lists
    and replaces them with dictionaries.
    """
    if isinstance(_dict, list):
        if len(_dict) == 1:
            if isinstance(_dict[0], dict):
                return list_to_dict(_dict[0])
            return _dict[0]
        id_header = match_header(_dict)
        if id_header is None:
            return _dict
        new = {i[id_header]: i for i in _dict}
        new = list_to_dict(new)
        return new
    elif isinstance(_dict, dict):
        for k in _dict:
            _dict[k] = list_to_dict(_dict[k])
    return _dict


def successive_get(_dict, *keys):
    """
    Get successively deeper keys from a dictionary.

    Parameters
    ----------
    _dict : dict
        The dictionary to get the keys from.
    *keys
        The keys to get from the dictionary.

    Returns
    -------
    object
        The object at the end of the key chain.
        If any of the keys is not found, None is returned.
    """
    for key in keys:
        _dict = _dict.get(key, None)
        if _dict is None:
            return None
    return _dict


def bracketize(s):
    """
    Ensure that a string is bracketized if it contains any special characters.
    """
    if any(i in s for i in " /:[](),@=+"):
        return f'"{s}"'
    return s


def match_header(_list):
    """
    Find an ID header in a list of dictionaries.
    """
    for id_header in _json_headers:
        for j in _list:
            if isinstance(j, dict) and id_header in j:
                return id_header


def make_cif(
    id: str,
    name: str,
    type: str,
    create_date: str,
    modify_date: str,
    mol_weight: str,
    formula: str,
    formal_charge: str,
    atoms: str,
    bonds: str,
    synonyms: str,
    descriptors: str,
) -> str:
    with open(_cif_template_file, "r") as f:
        cif = f.read()
    cif = cif.replace("{ID}", id)
    cif = cif.replace("{FULLNAME}", name)
    cif = cif.replace("{TYPE}", type)
    cif = cif.replace("{CREATE_DATE}", create_date)
    cif = cif.replace("{MODIFY_DATE}", modify_date)
    cif = cif.replace("{MOL_WEIGHT}", mol_weight)
    cif = cif.replace("{FORMULA}", formula)
    cif = cif.replace("{FORMAL_CHARGE}", formal_charge)
    cif = cif.replace("{ATOMS_TABLE}", atoms)
    cif = cif.replace("{BONDS_TABLE}", bonds)
    cif = cif.replace("{SYNONYMS_TABLE}", synonyms)
    cif = cif.replace("{DESCRIPTIONS_TABLE}", descriptors)
    return cif


def make_id(name, _dict):
    """
    Make an ID for the molecule from the PubChem data.

    Parameters
    ----------
    name : str
        The name of the molecule.

    _dict : dict
        The PubChem data.

    Returns
    -------
    str
        The ID.
    """
    id = get_iupac_condensed_id(_dict)
    if id is None:
        _names = get_synonyms(_dict)
        if len(_names) > 0:
            id = sorted(_names, key=len)[0]
    if id is None:
        id = name.replace(" ", "_").replace("-", "")[:3]
    else:
        id = id.replace(" ", "").replace("-", "")
    return id


def get_iupac_condensed_id(_dict):
    """
    Get the IUPAC condensed ID from the PubChem data.
    If it is available.
    """
    id = successive_get(
        _dict,
        "Biologic Description",
        "Section",
        "Biologic Line Notation",
        "Information",
        "IUPAC Condensed",
        "Value",
        "StringWithMarkup",
        "String",
    )
    return id


def get_synonyms(id, name, _dict):
    """
    Generate a list of synonyms of the molecule from the PubChem data.

    Parameters
    ----------
    id : str
        The ID of the molecule.
    name : str
        The name of the molecule.
    _dict : dict
        The PubChem data dictionary.

    Returns
    -------
    list
        A list of synonyms.
    str
        A cif-formatted table of the synonyms.
    """
    _names = _get_synonyms(_dict)
    _names[name] = name
    iupac_id = get_iupac_condensed_id(_dict)
    if iupac_id:
        _names[iupac_id] = iupac_id
    _synonyms = []
    for idx, name in enumerate(_names):
        _synonyms.append(
            (
                idx + 1,  # ordinal
                id,  # comp_id
                bracketize(name),  # name
            )
        )
    return _synonyms, tabulate(_synonyms, tablefmt="plain")


def make_atom_table(id, atoms):
    """
    Reformat the SDF atom data into a CIF-ready atom table.

    Parameters
    ----------
    id : str
        The ID of the molecule.
    atoms : list
        The SDF-parsed atom data.

    Returns
    -------
    list
        A reformatted list of atoms (required for `make_bond_table`).
    str
        A cif-formatted table of the atoms.
    """
    _atoms = []
    _atom_counts = {}
    for adx, element, charge, x, y, z in atoms:
        if element not in _atom_counts:
            _atom_counts[element] = 0
        _atom_counts[element] += 1
        a_id = f"{element}{_atom_counts[element]}"
        _atoms.append(
            (
                id,  # comp_id
                a_id,  # atom_id
                a_id,  # alt_atom_id
                element,  # type_symbol
                charge,  # charge
                x,  # x
                y,  # y
                z,  # z
                a_id,  # component_atom_id
                id,  # component_comp_id
                adx,  # ordinal
            )
        )
    return _atoms, tabulate(_atoms, tablefmt="plain")


def make_bond_table(id, bonds, atom_table):
    """
    Reformat the SDF bond data into a CIF-ready bond table.

    Parameters
    ----------
    id : str
        The ID of the molecule.
    bonds : list
        The SDF-parsed bond data.
    atom_table : list
        The atom table generated by `make_atom_table`.

    Returns
    -------
    list
        A reformatted bond list (not required by any other functions).
    str
        A cif-formatted table of the bonds.
    """
    _bonds = []
    for bdx, (a1, a2, bond_type) in enumerate(bonds):
        _bonds.append(
            (
                id,  # comp_id
                atom_table[a1 - 1][1],  # atom_id_1
                atom_table[a2 - 1][1],  # atom_id_2
                bond_type,  # value_order
                bdx + 1,  # ordinal
            )
        )
    return _bonds, tabulate(_bonds, tablefmt="plain")


def get_create_date(_dict):
    """
    Get the create date from the PubChem data.
    """
    date = successive_get(
        _dict, "Names and Identifiers", "Section", "Create Date", "Information", "Value"
    )
    if date:
        date = list(date.values())[0]
    else:
        date = "0000-00-00"
    return date


def get_modify_date(_dict):
    """
    Get the modify date from the PubChem data.
    """
    date = successive_get(
        _dict, "Names and Identifiers", "Section", "Modify Date", "Information", "Value"
    )
    if date:
        date = list(date.values())[0]
    else:
        date = get_create_date(_dict)
    return date


def get_mol_weight(_dict):
    """
    Get the molecular weight from the PubChem data.
    """
    weight = successive_get(
        _dict,
        "Chemical and Physical Properties",
        "Section",
        "Section",
        "Molecular Weight",
        "Information",
        "Value",
        "StringWithMarkup",
        "String",
    )
    if not weight:
        weight = "0.000"
    return weight


def get_formal_charge(_dict):
    """
    Get the formal charge from the PubChem data.
    """
    charge = successive_get(
        _dict,
        "Chemical and Physical Properties",
        "Section",
        "Section",
        "Formal Charge",
        "Information",
        "Value",
        "Number",
    )
    if not charge:
        charge = "0"
    return str(charge)


def get_formula(_dict):
    """
    Get the molecular formula for the compound.
    """
    formula = successive_get(
        _dict,
        "Names and Identifiers",
        "Section",
        "Molecular Formula",
        "Information",
        "Value",
        "StringWithMarkup",
        "String",
    )
    return formula


def get_descriptors(id, _dict):
    """
    Generate a list of descriptors of the molecule from the PubChem data.

    Parameters
    ----------
    id : str
        The ID of the molecule.
    _dict : dict
        The PubChem data dictionary.

    Returns
    -------
    list
        A list of descriptors.
    str
        A cif-formatted table of the descriptors.
    """
    _descriptors = _get_descriptors(_dict)
    _descriptions = []
    for key, value in _descriptors.items():
        _descriptions.append(
            (
                id,  # comp_id
                bracketize(key),  # type
                bracketize(value),  # descriptor
            )
        )
    return _descriptions, tabulate(_descriptions, tablefmt="plain")


def _get_synonyms(_dict):
    """
    Get a dictionary of names for the compound.
    """
    _names = successive_get(
        _dict,
        "Names and Identifiers",
        "Section",
        "Synonyms",
        "Section",
        "Information",
        "Value",
        "StringWithMarkup",
    )
    return _names


def _get_descriptors(_dict):
    """
    Get a dictionary of descriptors for the compound.
    """
    descriptors = successive_get(
        _dict, "Names and Identifiers", "Section", "Computed Descriptors", "Section"
    )
    _descriptors = {
        i: successive_get(j, "Information", "Value", "StringWithMarkup", "String")
        for i, j in descriptors.items()
    }
    return _descriptors


if __name__ == "__main__":
    pubchem_to_cif(
        "/Users/noahhk/Downloads/COMPOUND_CID_126961780-2.json",
        "/Users/noahhk/Downloads/Conformer3D_COMPOUND_CID_126961780-2.sdf",
        "Epi4",
        "Epi4.cif",
    )