Source code for buildamol.extensions.bio.glycans.iupac

"""
Functions to work with the IUPAC glycan nomenclature.
"""


class IUPACParser:
    """
    A parser for condensed IUPAC glycan nomenclature strings.

    The parser turns a string such as ``"Gal(b1-4)[Gal(b2-2)Glc(b1-6)]GlcNAc"``
    into a list of connecting glycan segments from which a Molecule can be
    built.

    The input is scanned from its last character towards its first. As a
    consequence residue names are accumulated in reverse (and flipped back
    when they are labelled), and a ``"]"`` is the point where a branch is
    *entered* while a ``"["`` is the point where it is *left*.
    """

    def __init__(self):
        self._string = ""
        self.reset()

    # ------------------------------------------------------------------
    # Cursor helpers. ``_idx`` counts characters from the END of the string,
    # so ``_idx == 1`` addresses the last character.
    # ------------------------------------------------------------------
    @property
    def _current(self):
        """The character under the cursor."""
        return self._string[-self._idx]

    @property
    def _next(self):
        """The character the cursor would reach after one ``_shift``."""
        return self._string[-self._idx - 1]

    @property
    def _can_shift(self):
        """Whether there is still a character in front of the cursor."""
        return self._idx < len(self._string)

    @property
    def _is_at_bracket(self):
        """Whether the cursor sits on a round bracket (a linkage delimiter)."""
        return self._current in ("(", ")")

    @property
    def _is_at_open_square_bracket(self):
        """Whether a branch is being entered; reading backwards this is ``"]"``."""
        return self._current == "]"

    @property
    def _is_at_close_square_bracket(self):
        """Whether a branch is being left; reading backwards this is ``"["``."""
        return self._current == "["

    @property
    def _can_store(self):
        """Whether a complete (residue, linkage, residue) triple is pending."""
        return (
            not self._is_still_parsing
            and len(self._latest_residue) >= 1
            and len(self._latest_linkage) >= 3
            and len(self._second_latest_residue) >= 1
        )

    @property
    def _is_still_parsing(self):
        """Whether the cursor is in the middle of a residue name."""
        if not self._can_shift:
            return False
        return self._current not in ("(", ")", "[", "]")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def parse(self, string: str) -> list:
        """
        Parse a string of IUPAC glycan nomenclature into a list of glycan segments.

        Parameters
        ----------
        string : str
            The IUPAC glycan nomenclature string. The greek letters ``α`` and
            ``β`` are accepted in place of ``a`` and ``b``.

        Returns
        -------
        list
            A list of tuples where each segment is a tuple of
            (residue1, residue2, linkage). Residues are labelled
            ``<name>@<count>`` and receive a ``b-`` prefix when their
            conformation letter in the linkage is ``b``. The linkage is the
            linkage text as written in the input with the ``-`` removed,
            followed by the conformation letter of residue1 and then of
            residue2. An empty string, or a string without any linkage,
            yields an empty list.
        """
        self._string = self._prep_greek_letters(string)
        self.reset()
        if not self._string:
            # Nothing to scan; ``_crop_end`` would index into an empty string.
            return self._glycan
        self._parse()
        self._glycan = self._adjust_betas(self._glycan)
        return self._glycan

    def reset(self):
        """
        Reset the parser.

        Clears the collected segments, the cursor, the residue counters, the
        branch stacks and all pending residue / linkage / conformation
        buffers. The input string itself is left untouched.
        """
        self._glycan = []
        self._idx = 1
        self._residue_counts = {}
        self._past_residue_before_square_bracket = []
        self._past_conformations_before_square_bracket = []
        self._latest_residue = ""
        self._latest_conformation = ""
        self._latest_linkage = ""
        self._second_latest_residue = ""
        self._second_latest_conformation = ""

    def __call__(self, *args, **kwds):
        """Shorthand for :meth:`parse`."""
        return self.parse(*args, **kwds)

    # ------------------------------------------------------------------
    # Branch stack helpers
    # ------------------------------------------------------------------
    @property
    def _latest_residue_before_square_bracket(self):
        """The residue on top of the branch stack (not removed)."""
        return self._past_residue_before_square_bracket[-1]

    @property
    def _latest_conformation_before_square_bracket(self):
        """The conformation on top of the branch stack (not removed)."""
        return self._past_conformations_before_square_bracket[-1]

    def _push_residue_before_square_bracket(self, residue):
        self._past_residue_before_square_bracket.append(residue)

    def _push_conformation_before_square_bracket(self, conformation):
        self._past_conformations_before_square_bracket.append(conformation)

    def _pop_residue_before_square_bracket(self):
        return self._past_residue_before_square_bracket.pop()

    def _pop_conformation_before_square_bracket(self):
        return self._past_conformations_before_square_bracket.pop()

    # ------------------------------------------------------------------
    # Scanning
    # ------------------------------------------------------------------
    def _shift(self):
        """Move the cursor one character towards the start of the string."""
        self._idx += 1

    def _shift_residue(self):
        """Make the latest residue/conformation the second-latest and clear the latest."""
        self._second_latest_residue = self._latest_residue
        self._latest_residue = ""
        self._second_latest_conformation = self._latest_conformation
        self._latest_conformation = ""

    def _parse(self):
        """Scan ``self._string`` backwards and fill ``self._glycan``."""
        self._crop_end()
        if not self._string:
            # The whole input was a root suffix such as "(b1-"; no residue left.
            return

        while self._can_shift:
            if self._can_store:
                # Store first and re-examine the same character afterwards.
                self._store()
                continue

            if self._is_at_bracket:
                self._shift_residue()
                self._latest_linkage = self._parse_linkage()
                self._shift()
                continue

            if self._is_at_open_square_bracket:
                # Entering a branch: label the residue the branch attaches to
                # (without bumping its counter) and remember it together with
                # its conformation until the branch is left again.
                self._latest_residue = self._fit_residue(
                    self._latest_residue, increase_count=False
                )
                self._push_residue_before_square_bracket(self._latest_residue)
                self._push_conformation_before_square_bracket(
                    self._latest_conformation
                )
                self._shift()
                continue

            if self._is_at_close_square_bracket:
                # Leaving a branch: continue from the remembered attachment residue.
                self._latest_residue = self._pop_residue_before_square_bracket()
                self._latest_conformation = (
                    self._pop_conformation_before_square_bracket()
                )
                self._second_latest_residue = ""
                self._second_latest_conformation = ""
                self._shift()
                continue

            self._latest_residue += self._current
            self._shift()

        # The loop stops ON the first character of the string without having
        # consumed it, so it still has to be added to the pending residue.
        self._latest_residue += self._current
        if self._latest_linkage:
            self._store()
        # Without a pending linkage (e.g. a lone residue such as "Glc") there
        # is no segment to report.

    def _store(self):
        """Append the pending (second-latest, latest, linkage) triple as a segment."""
        second = self._second_latest_residue
        if "@" not in second:
            second = self._fit_residue(second)
            self._second_latest_residue = second

        latest = self._latest_residue
        if "@" not in latest:
            latest = self._fit_residue(latest)
            self._latest_residue = latest

        branch = (second, latest, self._reformat_link(self._latest_linkage))
        self._glycan.append(branch)
        self._latest_linkage = ""

    def _fit_residue(self, r, increase_count=True):
        """
        Turn a raw (reversed) residue name into a ``<name>@<count>`` label.

        Parameters
        ----------
        r : str
            The residue text as accumulated by the backwards scan. A value
            that already contains ``"@"`` is returned unchanged.
        increase_count : bool
            Whether an already known residue name gets the next counter value
            (``True``) or re-uses its current one (``False``). A name seen for
            the first time always gets ``@1``.

        Returns
        -------
        str
            The labelled residue.
        """
        if "@" in r:
            return r

        r = r[::-1]  # undo the reversal caused by scanning backwards
        if r in self._residue_counts:
            if increase_count:
                self._residue_counts[r] += 1
            r += "@" + str(self._residue_counts[r])
        else:
            self._residue_counts[r] = 1
            r += "@1"
        return r

    def _parse_linkage(self):
        """
        Read the text between two round brackets.

        The last character read (the first one after ``"("`` in the input)
        is stored as the latest conformation; the remaining, still reversed,
        text is returned.
        """
        self._shift()
        linkage = ""
        while self._can_shift and not self._is_at_bracket:
            linkage += self._current
            self._shift()
        self._latest_conformation = linkage[-1]
        linkage = linkage[:-1]
        return linkage

    def _crop_end(self):
        """
        Strip a trailing root suffix such as ``"(b1-"`` from the string.

        When the string ends with ``"-"`` the character right after the last
        ``"("`` becomes the latest conformation and everything from that
        ``"("`` onwards is removed. Otherwise the conformation defaults to
        ``"a"``.
        """
        if self._string[-1] == "-":
            while self._next != "(":
                self._shift()
            self._latest_conformation = self._current
            self._shift()
            self._string = self._string[: -self._idx]
            self._idx = 1
        else:
            # assume alpha conformation if not specified
            self._latest_conformation = "a"

    def _reformat_link(self, link):
        """
        Bring a reversed linkage such as ``"4-1"`` into its stored form.

        The text is flipped back, the ``"-"`` is removed and the second-latest
        and latest conformation letters are appended (``"4-1"`` -> ``"14ab"``).
        A link without ``"-"`` is only flipped back.
        """
        if "-" not in link:
            return link[::-1]
        link = link[::-1].replace("-", "")
        link = link + self._second_latest_conformation + self._latest_conformation
        return link

    def _prep_greek_letters(self, string):
        """Replace ``α`` / ``β`` by ``a`` / ``b``."""
        string = string.replace("α", "a").replace("β", "b")
        return string

    def _adjust_betas(self, segments):
        """
        Prefix residues with ``"b-"`` where the linkage marks them as ``b``.

        ``link[2]`` belongs to the first residue of a segment and ``link[3]``
        to the second one. The list is updated in place and returned.
        """
        for i, (a, b, link) in enumerate(segments):
            if link[2] == "b" and not a.startswith("b-"):
                a = "b-" + a
            if link[3] == "b" and not b.startswith("b-"):
                b = "b-" + b
            segments[i] = (a, b, link)
        return segments
if __name__ == "__main__":
    # Manual smoke run: parse one branched example string.
    parser = IUPACParser()
    segments = parser.parse("Gal(b1-4)[Gal(b2-2)Glc(b1-6)]GlcNAc")