Source code for biobuild.resources.pubchem

"""
This module contains functions for interacting with PubChem.

Remotely accessing PubChem
--------------------------

The `query` function can be used to query PubChem for a compound. It returns the 2D and 3D
representations of the compound as `pubchempy.Compound` objects. The `pubchempy` package is
used to perform the query. The `query` function takes a query string and a query type as
input. The query type can be one of "name", "cid", "smiles", "inchi", "inchikey".
The returned outputs can be used to create ``Molecule`` objects using a function ``biobuild.Molecule._molecule_from_pubchem`` which is by default integrated
into the ``biobuild.Molecule.Molecule.from_pubchem`` class method.

Converting PubChem data to CIF
------------------------------
PubChem allows data downloads in the form of JSON and SDF files. The JSON file contains the descriptive data of a chemical compound while the SDF contains
the 3D conformer of the compound. The `pubchem_to_cif` function can be used to convert these files into a CIF file which can be used to load the compound
into biobuild using the ``PDBECompounds`` class. The function takes the paths to the JSON and SDF files as input and optionally an ID for the compound
and a path to the CIF file to write. If no ID is specified, the function will try to infer an ID from the JSON file.

.. code-block:: python

    from biobuild.resources import pubchem

    # ... download JSON and SDF files from PubChem for some compound
    json_file = "some_compound.json"
    sdf_file = "some_compound.sdf"

    # convert to CIF
    pubchem.pubchem_to_cif(json_file, sdf_file, id="SOMECOMPOUND", cif_file="some_compound.cif")
"""

import json
import os
import re
import pubchempy as pcp
from tabulate import tabulate

_grammar_dict = {
    1: "st",
    2: "nd",
    3: "rd",
}
"""
A mapping for the first three numbers to their ordinal suffixes.
"""

_sdf_coord_pattern = re.compile(
    r"^ {3,}(-?\d+\.\d*) {3,}(-?\d+\.\d*) {3,}(-?\d+\.\d*) +(\w+) +(\d)"
)
"""
Regex pattern for extracting atom coordinates from SDF file.
The pattern has 5 groups:
    1. x coordinate
    2. y coordinate
    3. z coordinate
    4. atom symbol
    5. charge
"""

_sdf_bond_pattern = re.compile(r"^ +(\d+) +(\d+) +(\d) {2}\d")
"""
Regex pattern for extracting bond information from SDF file.
The pattern has 3 groups:
    1. atom 1
    2. atom 2
    3. bond type
"""

_sdf_bond_type_map = {
    "1": "SING",
    "2": "DOUB",
    "3": "TRIP",
    "4": "DOUB",
}
"""
Mapping of SDF bond types to biobuild bond types.
"""

_sdf_charge_map = {
    "0": 0,
    "3": 1,
    "2": 2,
    "1": 3,
    "5": -1,
    "6": -2,
    "7": -3,
    "4": 0,  # would be radical, but we don't support that
}
"""
Mapping of SDF charge types to proper values.
"""

_cif_template_file = os.path.join(os.path.dirname(__file__), "pdbe_template.cif")
"""
Path to the CIF template file.
"""

_json_headers = ("TOCHeading", "Name", "String")
"""
Id providing headers in the JSON data structure
"""

# =========================================================================== #
#                               Public API                                   #
# =========================================================================== #


[docs] def query(_query: str, by: str = "name", idx: int = 0): """ Query PubChem for a compound. Parameters ---------- _query : str The query string by : str, optional The type of query to perform. One of "name", "cid", "smiles", "inchi", "inchikey". Defaults to "name". idx : int, optional The index of the compound to return if multiple compounds are found. Defaults to 0. Only one compound will be returned at a time. Returns ------- tuple The 2D and 3D representations of the compound """ _2d_comp = pcp.get_compounds(_query, by) _3d_comp = pcp.get_compounds(_query, by, record_type="3d") if len(_2d_comp) == 0 or len(_3d_comp) == 0: raise ValueError(f"Could not find molecule {_query} in PubChem") elif len(_2d_comp) > 1 or len(_3d_comp) > 1: Warning( f"Found multiple molecules for {_query} in PubChem, using the {idx+1}{_grammar_dict.get(idx+1, 'th')} one!" ) return _2d_comp[idx], _3d_comp[idx]
[docs] def pubchem_to_cif(json_file: str, sdf_file: str, id: str = None, cif_file: str = None): """ Convert a downloaded PubChem data entry to a CIF file. This is useful for preparing a PubChem component for use in biobuild - i.e. to relabel atoms as required by the CHARMM force field. Simply download the PubChem data for a molecule in form of the JSON data and the 3D conformer SDF file and pass the file names to this function to generate a CIF file from them which can be loaded into biobuild using the `PDBECompounds` class. Parameters ---------- json_file : str The path to the JSON file containing the PubChem data of the molecule. sdf_file : str The path to the SDF file containing the 3D conformer of the molecule. id : str, optional The ID of the molecule. If not specified, an id will be inferred from the JSON file. cif_file : str, optional The path to the CIF file to write. If not specified, the CIF file will be written to the same directory as the JSON file with the same name (but adjusted suffix). """ atoms, bonds = load_sdf(sdf_file) name, data = load_json(json_file) if id is None: id = make_id(name, data) # make the atom and bond tables atoms, atom_table = make_atom_table(id, atoms) _, bond_table = make_bond_table(id, bonds, atoms) # now make the CIF file cif = make_cif( id=id, name=name, type="Ligand", atoms=atom_table, bonds=bond_table, create_date=get_create_date(data), modify_date=get_modify_date(data), formal_charge=get_formal_charge(data), mol_weight=get_mol_weight(data), formula=get_formula(data), synonyms=get_synonyms(id, name, data)[1], descriptors=get_descriptors(id, data)[1], ) # now write the CIF file if cif_file is None: # just in case the json file has a different suffix cif_file = json_file.replace(".json", "") + ".cif" with open(cif_file, "w") as f: f.write(cif)
__all__ = ["query", "pubchem_to_cif"] # =========================================================================== # # Private API # # =========================================================================== # def load_json(json_file): """ Load a PubChem JSON file and return the data as a dictionary. Parameters ---------- json_file : str The path to the JSON file. Returns ------- tuple The name of the molecule and the data as a dictionary. """ with open(json_file, "r") as f: data = json.load(f)["Record"] name = data["RecordTitle"] data = data["Section"] # now remove the list structures data = {i[_json_headers[0]]: i for i in data} data = list_to_dict(data) return name, data def load_sdf(sdf_file): """ Load the SDF file and return the data a list of atom data and bond data. Parameters ---------- sdf_file : str The path to the SDF file. Returns ------- tuple The atom data and bond data. """ atom_data = [] bond_data = [] with open(sdf_file, "r") as f: for line in f: if _sdf_coord_pattern.match(line): m = _sdf_coord_pattern.match(line) atom_data.append(m.groups()) elif _sdf_bond_pattern.match(line): m = _sdf_bond_pattern.match(line) bond_data.append(m.groups()) # Convert the data to a better format atom_data = [ ( idx + 1, # atom index i[-2], # atom element _sdf_charge_map[i[-1]], # charge i[0], i[1], i[2], # coordinatex xyz ) for idx, i in enumerate(atom_data) ] bond_data = [(int(i[0]), int(i[1]), _sdf_bond_type_map[i[2]]) for i in bond_data] return atom_data, bond_data def list_to_dict(_dict): """ Flatten list-substructures into dictionaries. For some reason the PubChem JSON data structure contains lists of dictionaries where pure dictionaries would be sufficient. This function removes the lists and replaces them with dictionaries. """ if isinstance(_dict, list): if len(_dict) == 1: if isinstance(_dict[0], dict): return list_to_dict(_dict[0]) return _dict[0] id_header = match_header(_dict) if id_header is None: return _dict new = {i[id_header]: i for i in _dict} new = list_to_dict(new) return new elif isinstance(_dict, dict): for k in _dict: _dict[k] = list_to_dict(_dict[k]) return _dict def successive_get(_dict, *keys): """ Get successively deeper keys from a dictionary. Parameters ---------- _dict : dict The dictionary to get the keys from. *keys The keys to get from the dictionary. Returns ------- object The object at the end of the key chain. If any of the keys is not found, None is returned. """ for key in keys: _dict = _dict.get(key, None) if _dict is None: return None return _dict def bracketize(s): """ Ensure that a string is bracketized if it contains any special characters. """ if any(i in s for i in " /:[](),@=+"): return f'"{s}"' return s def match_header(_list): """ Find an ID header in a list of dictionaries. """ for id_header in _json_headers: for j in _list: if isinstance(j, dict) and id_header in j: return id_header def make_cif( id: str, name: str, type: str, create_date: str, modify_date: str, mol_weight: str, formula: str, formal_charge: str, atoms: str, bonds: str, synonyms: str, descriptors: str, ) -> str: with open(_cif_template_file, "r") as f: cif = f.read() cif = cif.replace("{ID}", id) cif = cif.replace("{FULLNAME}", name) cif = cif.replace("{TYPE}", type) cif = cif.replace("{CREATE_DATE}", create_date) cif = cif.replace("{MODIFY_DATE}", modify_date) cif = cif.replace("{MOL_WEIGHT}", mol_weight) cif = cif.replace("{FORMULA}", formula) cif = cif.replace("{FORMAL_CHARGE}", formal_charge) cif = cif.replace("{ATOMS_TABLE}", atoms) cif = cif.replace("{BONDS_TABLE}", bonds) cif = cif.replace("{SYNONYMS_TABLE}", synonyms) cif = cif.replace("{DESCRIPTIONS_TABLE}", descriptors) return cif def make_id(name, _dict): """ Make an ID for the molecule from the PubChem data. Parameters ---------- name : str The name of the molecule. _dict : dict The PubChem data. Returns ------- str The ID. """ id = get_iupac_condensed_id(_dict) if id is None: _names = get_synonyms(_dict) if len(_names) > 0: id = sorted(_names, key=len)[0] if id is None: id = name.replace(" ", "_").replace("-", "")[:3] else: id = id.replace(" ", "").replace("-", "") return id def get_iupac_condensed_id(_dict): """ Get the IUPAC condensed ID from the PubChem data. If it is available. """ id = successive_get( _dict, "Biologic Description", "Section", "Biologic Line Notation", "Information", "IUPAC Condensed", "Value", "StringWithMarkup", "String", ) return id def get_synonyms(id, name, _dict): """ Generate a list of synonyms of the molecule from the PubChem data. Parameters ---------- id : str The ID of the molecule. name : str The name of the molecule. _dict : dict The PubChem data dictionary. Returns ------- list A list of synonyms. str A cif-formatted table of the synonyms. """ _names = _get_synonyms(_dict) _names[name] = name iupac_id = get_iupac_condensed_id(_dict) if iupac_id: _names[iupac_id] = iupac_id _synonyms = [] for idx, name in enumerate(_names): _synonyms.append( ( idx + 1, # ordinal id, # comp_id bracketize(name), # name ) ) return _synonyms, tabulate(_synonyms, tablefmt="plain") def make_atom_table(id, atoms): """ Reformat the SDF atom data into a CIF-ready atom table. Parameters ---------- id : str The ID of the molecule. atoms : list The SDF-parsed atom data. Returns ------- list A reformatted list of atoms (required for `make_bond_table`). str A cif-formatted table of the atoms. """ _atoms = [] _atom_counts = {} for adx, element, charge, x, y, z in atoms: if element not in _atom_counts: _atom_counts[element] = 0 _atom_counts[element] += 1 a_id = f"{element}{_atom_counts[element]}" _atoms.append( ( id, # comp_id a_id, # atom_id a_id, # alt_atom_id element, # type_symbol charge, # charge x, # x y, # y z, # z a_id, # component_atom_id id, # component_comp_id adx, # ordinal ) ) return _atoms, tabulate(_atoms, tablefmt="plain") def make_bond_table(id, bonds, atom_table): """ Reformat the SDF bond data into a CIF-ready bond table. Parameters ---------- id : str The ID of the molecule. bonds : list The SDF-parsed bond data. atom_table : list The atom table generated by `make_atom_table`. Returns ------- list A reformatted bond list (not required by any other functions). str A cif-formatted table of the bonds. """ _bonds = [] for bdx, (a1, a2, bond_type) in enumerate(bonds): _bonds.append( ( id, # comp_id atom_table[a1 - 1][1], # atom_id_1 atom_table[a2 - 1][1], # atom_id_2 bond_type, # value_order bdx + 1, # ordinal ) ) return _bonds, tabulate(_bonds, tablefmt="plain") def get_create_date(_dict): """ Get the create date from the PubChem data. """ date = successive_get( _dict, "Names and Identifiers", "Section", "Create Date", "Information", "Value" ) if date: date = list(date.values())[0] else: date = "0000-00-00" return date def get_modify_date(_dict): """ Get the modify date from the PubChem data. """ date = successive_get( _dict, "Names and Identifiers", "Section", "Modify Date", "Information", "Value" ) if date: date = list(date.values())[0] else: date = get_create_date(_dict) return date def get_mol_weight(_dict): """ Get the molecular weight from the PubChem data. """ weight = successive_get( _dict, "Chemical and Physical Properties", "Section", "Section", "Molecular Weight", "Information", "Value", "StringWithMarkup", "String", ) if not weight: weight = "0.000" return weight def get_formal_charge(_dict): """ Get the formal charge from the PubChem data. """ charge = successive_get( _dict, "Chemical and Physical Properties", "Section", "Section", "Formal Charge", "Information", "Value", "Number", ) if not charge: charge = "0" return str(charge) def get_formula(_dict): """ Get the molecular formula for the compound. """ formula = successive_get( _dict, "Names and Identifiers", "Section", "Molecular Formula", "Information", "Value", "StringWithMarkup", "String", ) return formula def get_descriptors(id, _dict): """ Generate a list of descriptors of the molecule from the PubChem data. Parameters ---------- id : str The ID of the molecule. _dict : dict The PubChem data dictionary. Returns ------- list A list of descriptors. str A cif-formatted table of the descriptors. """ _descriptors = _get_descriptors(_dict) _descriptions = [] for key, value in _descriptors.items(): _descriptions.append( ( id, # comp_id bracketize(key), # type bracketize(value), # descriptor ) ) return _descriptions, tabulate(_descriptions, tablefmt="plain") def _get_synonyms(_dict): """ Get a dictionary of names for the compound. """ _names = successive_get( _dict, "Names and Identifiers", "Section", "Synonyms", "Section", "Information", "Value", "StringWithMarkup", ) return _names def _get_descriptors(_dict): """ Get a dictionary of descriptors for the compound. """ descriptors = successive_get( _dict, "Names and Identifiers", "Section", "Computed Descriptors", "Section" ) _descriptors = { i: successive_get(j, "Information", "Value", "StringWithMarkup", "String") for i, j in descriptors.items() } return _descriptors if __name__ == "__main__": pubchem_to_cif( "/Users/noahhk/Downloads/COMPOUND_CID_126961780-2.json", "/Users/noahhk/Downloads/Conformer3D_COMPOUND_CID_126961780-2.sdf", "Epi4", "Epi4.cif", )