diff options
Diffstat (limited to 'plip/basic/supplemental.py')
-rw-r--r-- | plip/basic/supplemental.py | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/plip/basic/supplemental.py b/plip/basic/supplemental.py new file mode 100644 index 0000000..6b3ef7b --- /dev/null +++ b/plip/basic/supplemental.py @@ -0,0 +1,434 @@ +import gzip +import itertools +import os +import platform +import re +import subprocess +import sys +import tempfile +import zipfile +from collections import namedtuple + +import numpy as np +from openbabel import pybel +from openbabel.pybel import Atom + +from plip.basic import config, logger + +logger = logger.get_logger() + +# Windows and MacOS +if os.name != 'nt' and platform.system() != 'Darwin': # Resource module not available for Windows + import resource + +# Settings +np.seterr(all='ignore') # No runtime warnings + + +def tmpfile(prefix, direc): + """Returns the path to a newly created temporary file.""" + return tempfile.mktemp(prefix=prefix, suffix='.pdb', dir=direc) + + +def is_lig(hetid): + """Checks if a PDB compound can be excluded as a small molecule ligand""" + h = hetid.upper() + return not (h == 'HOH' or h in config.UNSUPPORTED) + + +def extract_pdbid(string): + """Use regular expressions to get a PDB ID from a string""" + p = re.compile("[0-9][0-9a-z]{3}") + m = p.search(string.lower()) + try: + return m.group() + except AttributeError: + return "UnknownProtein" + + +def whichrestype(atom): + """Returns the residue name of an Pybel or OpenBabel atom.""" + atom = atom if not isinstance(atom, Atom) else atom.OBAtom # Convert to OpenBabel Atom + return atom.GetResidue().GetName() if atom.GetResidue() is not None else None + + +def whichresnumber(atom): + """Returns the residue number of an Pybel or OpenBabel atom (numbering as in original PDB file).""" + atom = atom if not isinstance(atom, Atom) else atom.OBAtom # Convert to OpenBabel Atom + return atom.GetResidue().GetNum() if atom.GetResidue() is not None else None + + +def whichchain(atom): + """Returns the residue number of an PyBel or OpenBabel atom.""" + atom = atom if not isinstance(atom, Atom) else atom.OBAtom # Convert to OpenBabel Atom + return atom.GetResidue().GetChain() if atom.GetResidue() is not None else None + + +######################### +# Mathematical operations +######################### + + +def euclidean3d(v1, v2): + """Faster implementation of euclidean distance for the 3D case.""" + if not len(v1) == 3 and len(v2) == 3: + return None + return np.sqrt((v1[0] - v2[0]) ** 2 + (v1[1] - v2[1]) ** 2 + (v1[2] - v2[2]) ** 2) + + +def vector(p1, p2): + """Vector from p1 to p2. + :param p1: coordinates of point p1 + :param p2: coordinates of point p2 + :returns : numpy array with vector coordinates + """ + return None if len(p1) != len(p2) else np.array([p2[i] - p1[i] for i in range(len(p1))]) + + +def vecangle(v1, v2, deg=True): + """Calculate the angle between two vectors + :param v1: coordinates of vector v1 + :param v2: coordinates of vector v2 + :param deg: whether to return degrees or radians + :returns : angle in degree or rad + """ + if np.array_equal(v1, v2): + return 0.0 + dm = np.dot(v1, v2) + cm = np.linalg.norm(v1) * np.linalg.norm(v2) + angle = np.arccos(dm / cm) # Round here to prevent floating point errors + return np.degrees([angle, ])[0] if deg else angle + + +def normalize_vector(v): + """Take a vector and return the normalized vector + :param v: a vector v + :returns : normalized vector v + """ + norm = np.linalg.norm(v) + return v / norm if not norm == 0 else v + + +def centroid(coo): + """Calculates the centroid from a 3D point cloud and returns the coordinates + :param coo: Array of coordinate arrays + :returns : centroid coordinates as list + """ + return list(map(np.mean, (([c[0] for c in coo]), ([c[1] for c in coo]), ([c[2] for c in coo])))) + + +def projection(pnormal1, ppoint, tpoint): + """Calculates the centroid from a 3D point cloud and returns the coordinates + :param pnormal1: normal of plane + :param ppoint: coordinates of point in the plane + :param tpoint: coordinates of point to be projected + :returns : coordinates of point orthogonally projected on the plane + """ + # Choose the plane normal pointing to the point to be projected + pnormal2 = [coo * (-1) for coo in pnormal1] + d1 = euclidean3d(tpoint, pnormal1 + ppoint) + d2 = euclidean3d(tpoint, pnormal2 + ppoint) + pnormal = pnormal1 if d1 < d2 else pnormal2 + # Calculate the projection of tpoint to the plane + sn = -np.dot(pnormal, vector(ppoint, tpoint)) + sd = np.dot(pnormal, pnormal) + sb = sn / sd + return [c1 + c2 for c1, c2 in zip(tpoint, [sb * pn for pn in pnormal])] + + +def cluster_doubles(double_list): + """Given a list of doubles, they are clustered if they share one element + :param double_list: list of doubles + :returns : list of clusters (tuples) + """ + location = {} # hashtable of which cluster each element is in + clusters = [] + # Go through each double + for t in double_list: + a, b = t[0], t[1] + # If they both are already in different clusters, merge the clusters + if a in location and b in location: + if location[a] != location[b]: + if location[a] < location[b]: + clusters[location[a]] = clusters[location[a]].union(clusters[location[b]]) # Merge clusters + clusters = clusters[:location[b]] + clusters[location[b] + 1:] + else: + clusters[location[b]] = clusters[location[b]].union(clusters[location[a]]) # Merge clusters + clusters = clusters[:location[a]] + clusters[location[a] + 1:] + # Rebuild index of locations for each element as they have changed now + location = {} + for i, cluster in enumerate(clusters): + for c in cluster: + location[c] = i + else: + # If a is already in a cluster, add b to that cluster + if a in location: + clusters[location[a]].add(b) + location[b] = location[a] + # If b is already in a cluster, add a to that cluster + if b in location: + clusters[location[b]].add(a) + location[a] = location[b] + # If neither a nor b is in any cluster, create a new one with a and b + if not (b in location and a in location): + clusters.append(set(t)) + location[a] = len(clusters) - 1 + location[b] = len(clusters) - 1 + return map(tuple, clusters) + + +################# +# File operations +################# + +def tilde_expansion(folder_path): + """Tilde expansion, i.e. converts '~' in paths into <value of $HOME>.""" + return os.path.expanduser(folder_path) if '~' in folder_path else folder_path + + +def folder_exists(folder_path): + """Checks if a folder exists""" + return os.path.exists(folder_path) + + +def create_folder_if_not_exists(folder_path): + """Creates a folder if it does not exists.""" + folder_path = tilde_expansion(folder_path) + folder_path = "".join([folder_path, '/']) if not folder_path[-1] == '/' else folder_path + direc = os.path.dirname(folder_path) + if not folder_exists(direc): + os.makedirs(direc) + + +def cmd_exists(c): + return subprocess.call("type " + c, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 + + +################ +# PyMOL-specific +################ + + +def initialize_pymol(options): + """Initializes PyMOL""" + import pymol + # Pass standard arguments of function to prevent PyMOL from printing out PDB headers (workaround) + pymol.finish_launching(args=['pymol', options, '-K']) + pymol.cmd.reinitialize() + + +def start_pymol(quiet=False, options='-p', run=False): + """Starts up PyMOL and sets general options. Quiet mode suppresses all PyMOL output. + Command line options can be passed as the second argument.""" + import pymol + pymol.pymol_argv = ['pymol', '%s' % options] + sys.argv[1:] + if run: + initialize_pymol(options) + if quiet: + pymol.cmd.feedback('disable', 'all', 'everything') + + +def nucleotide_linkage(residues): + """Support for DNA/RNA ligands by finding missing covalent linkages to stitch DNA/RNA together.""" + + nuc_covalent = [] + ####################################### + # Basic support for RNA/DNA as ligand # + ####################################### + nucleotides = ['A', 'C', 'T', 'G', 'U', 'DA', 'DC', 'DT', 'DG', 'DU'] + dna_rna = {} # Dictionary of DNA/RNA residues by chain + covlinkage = namedtuple("covlinkage", "id1 chain1 pos1 conf1 id2 chain2 pos2 conf2") + # Create missing covlinkage entries for DNA/RNA + for ligand in residues: + resname, chain, pos = ligand + if resname in nucleotides: + if chain not in dna_rna: + dna_rna[chain] = [(resname, pos), ] + else: + dna_rna[chain].append((resname, pos)) + for chain in dna_rna: + nuc_list = dna_rna[chain] + for i, nucleotide in enumerate(nuc_list): + if not i == len(nuc_list) - 1: + name, pos = nucleotide + nextnucleotide = nuc_list[i + 1] + nextname, nextpos = nextnucleotide + newlink = covlinkage(id1=name, chain1=chain, pos1=pos, conf1='', + id2=nextname, chain2=chain, pos2=nextpos, conf2='') + nuc_covalent.append(newlink) + + return nuc_covalent + + +def ring_is_planar(ring, r_atoms): + """Given a set of ring atoms, check if the ring is sufficiently planar + to be considered aromatic""" + normals = [] + for a in r_atoms: + adj = pybel.ob.OBAtomAtomIter(a.OBAtom) + # Check for neighboring atoms in the ring + n_coords = [pybel.Atom(neigh).coords for neigh in adj if ring.IsMember(neigh)] + vec1, vec2 = vector(a.coords, n_coords[0]), vector(a.coords, n_coords[1]) + normals.append(np.cross(vec1, vec2)) + # Given all normals of ring atoms and their neighbors, the angle between any has to be 5.0 deg or less + for n1, n2 in itertools.product(normals, repeat=2): + arom_angle = vecangle(n1, n2) + if all([arom_angle > config.AROMATIC_PLANARITY, arom_angle < 180.0 - config.AROMATIC_PLANARITY]): + return False + return True + + +def classify_by_name(names): + """Classify a (composite) ligand by the HETID(s)""" + if len(names) > 3: # Polymer + if len(set(config.RNA).intersection(set(names))) != 0: + ligtype = 'RNA' + elif len(set(config.DNA).intersection(set(names))) != 0: + ligtype = 'DNA' + else: + ligtype = "POLYMER" + else: + ligtype = 'SMALLMOLECULE' + + for name in names: + if name in config.METAL_IONS: + if len(names) == 1: + ligtype = 'ION' + else: + if "ION" not in ligtype: + ligtype += '+ION' + return ligtype + + +def sort_members_by_importance(members): + """Sort the members of a composite ligand according to two criteria: + 1. Split up in main and ion group. Ion groups are located behind the main group. + 2. Within each group, sort by chain and position.""" + main = [x for x in members if x[0] not in config.METAL_IONS] + ion = [x for x in members if x[0] in config.METAL_IONS] + sorted_main = sorted(main, key=lambda x: (x[1], x[2])) + sorted_main = sorted(main, key=lambda x: (x[1], x[2])) + sorted_ion = sorted(ion, key=lambda x: (x[1], x[2])) + return sorted_main + sorted_ion + + +def get_isomorphisms(reference, lig): + """Get all isomorphisms of the ligand.""" + query = pybel.ob.CompileMoleculeQuery(reference.OBMol) + mappr = pybel.ob.OBIsomorphismMapper.GetInstance(query) + if all: + isomorphs = pybel.ob.vvpairUIntUInt() + mappr.MapAll(lig.OBMol, isomorphs) + else: + isomorphs = pybel.ob.vpairUIntUInt() + mappr.MapFirst(lig.OBMol, isomorphs) + isomorphs = [isomorphs] + logger.debug(f'number of isomorphisms: {len(isomorphs)}') + # @todo Check which isomorphism to take + return isomorphs + + +def canonicalize(lig, preserve_bond_order=False): + """Get the canonical atom order for the ligand.""" + atomorder = None + # Get canonical atom order + + lig = pybel.ob.OBMol(lig.OBMol) + if not preserve_bond_order: + for bond in pybel.ob.OBMolBondIter(lig): + if bond.GetBondOrder() != 1: + bond.SetBondOrder(1) + lig.DeleteData(pybel.ob.StereoData) + lig = pybel.Molecule(lig) + testcan = lig.write(format='can') + try: + pybel.readstring('can', testcan) + reference = pybel.readstring('can', testcan) + except IOError: + testcan, reference = '', '' + if testcan != '': + reference.removeh() + isomorphs = get_isomorphisms(reference, lig) # isomorphs now holds all isomorphisms within the molecule + if not len(isomorphs) == 0: + smi_dict = {} + smi_to_can = isomorphs[0] + for x in smi_to_can: + smi_dict[int(x[1]) + 1] = int(x[0]) + 1 + atomorder = [smi_dict[x + 1] for x in range(len(lig.atoms))] + else: + atomorder = None + return atomorder + + +def int32_to_negative(int32): + """Checks if a suspicious number (e.g. ligand position) is in fact a negative number represented as a + 32 bit integer and returns the actual number. + """ + dct = {} + if int32 == 4294967295: # Special case in some structures (note, this is just a workaround) + return -1 + for i in range(-1000, -1): + dct[np.uint32(i)] = i + if int32 in dct: + return dct[int32] + else: + return int32 + + +def read_pdb(pdbfname, as_string=False): + """Reads a given PDB file and returns a Pybel Molecule.""" + pybel.ob.obErrorLog.StopLogging() # Suppress all OpenBabel warnings + if os.name != 'nt': # Resource module not available for Windows + maxsize = resource.getrlimit(resource.RLIMIT_STACK)[-1] + resource.setrlimit(resource.RLIMIT_STACK, (min(2 ** 28, maxsize), maxsize)) + sys.setrecursionlimit(10 ** 5) # increase Python recursion limit + return readmol(pdbfname, as_string=as_string) + + +def read(fil): + """Returns a file handler and detects gzipped files.""" + if os.path.splitext(fil)[-1] == '.gz': + return gzip.open(fil, 'rb') + elif os.path.splitext(fil)[-1] == '.zip': + zf = zipfile.ZipFile(fil, 'r') + return zf.open(zf.infolist()[0].filename) + else: + return open(fil, 'r') + + +def readmol(path, as_string=False): + """Reads the given molecule file and returns the corresponding Pybel molecule as well as the input file type. + In contrast to the standard Pybel implementation, the file is closed properly.""" + supported_formats = ['pdb'] + # Fix for Windows-generated files: Remove carriage return characters + if "\r" in path and as_string: + path = path.replace('\r', '') + + for sformat in supported_formats: + obc = pybel.ob.OBConversion() + obc.SetInFormat(sformat) + logger.debug(f'detected {sformat} as format, trying to read file with OpenBabel') + + # Read molecules with single bond information + if as_string: + try: + mymol = pybel.readstring(sformat, path) + except IOError: + logger.error('no valid file format provided') + sys.exit(1) + else: + read_file = pybel.readfile(format=sformat, filename=path, opt={"s": None}) + try: + mymol = next(read_file) + except StopIteration: + logger.error('file contains no valid molecules') + sys.exit(1) + + logger.debug('molecule successfully read') + + # Assign multiple bonds + mymol.OBMol.PerceiveBondOrders() + return mymol, sformat + + logger.error('no valid file format provided') + sys.exit(1) |