plip/basic/supplemental.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486

import gzip
import itertools
import os
import platform
import re
import subprocess
import sys
import tempfile
import zipfile
from collections import namedtuple

import numpy as np
from openbabel import pybel
from openbabel.pybel import Atom

from plip.basic import config, logger

logger = logger.get_logger()

# Windows and MacOS
if (
    os.name != "nt" and platform.system() != "Darwin"
):  # Resource module not available for Windows
    import resource

# Settings
np.seterr(all="ignore")  # No runtime warnings


def tmpfile(prefix, direc):
    """Returns the path to a newly created temporary file."""
    return tempfile.mktemp(prefix=prefix, suffix=".pdb", dir=direc)


def is_lig(hetid):
    """Checks if a PDB compound can be excluded as a small molecule ligand"""
    h = hetid.upper()
    return not (h == "HOH" or h in config.UNSUPPORTED)


def extract_pdbid(string):
    """Use regular expressions to get a PDB ID from a string"""
    p = re.compile("[0-9][0-9a-z]{3}")
    m = p.search(string.lower())
    try:
        return m.group()
    except AttributeError:
        return "UnknownProtein"


def whichrestype(atom):
    """Returns the residue name of an Pybel or OpenBabel atom."""
    atom = (
        atom if not isinstance(atom, Atom) else atom.OBAtom
    )  # Convert to OpenBabel Atom
    return atom.GetResidue().GetName() if atom.GetResidue() is not None else None


def whichresnumber(atom):
    """Returns the residue number of an Pybel or OpenBabel atom (numbering as in original PDB file)."""
    atom = (
        atom if not isinstance(atom, Atom) else atom.OBAtom
    )  # Convert to OpenBabel Atom
    return atom.GetResidue().GetNum() if atom.GetResidue() is not None else None


def whichchain(atom):
    """Returns the residue number of an PyBel or OpenBabel atom."""
    atom = (
        atom if not isinstance(atom, Atom) else atom.OBAtom
    )  # Convert to OpenBabel Atom
    return atom.GetResidue().GetChain() if atom.GetResidue() is not None else None


#########################
# Mathematical operations
#########################


def euclidean3d(v1, v2):
    """Faster implementation of euclidean distance for the 3D case."""
    if not len(v1) == 3 and len(v2) == 3:
        return None
    return np.sqrt((v1[0] - v2[0]) ** 2 + (v1[1] - v2[1]) ** 2 + (v1[2] - v2[2]) ** 2)


def vector(p1, p2):
    """Vector from p1 to p2.
    :param p1: coordinates of point p1
    :param p2: coordinates of point p2
    :returns : numpy array with vector coordinates
    """
    return (
        None
        if len(p1) != len(p2)
        else np.array([p2[i] - p1[i] for i in range(len(p1))])
    )


def vecangle(v1, v2, deg=True):
    """Calculate the angle between two vectors
    :param v1: coordinates of vector v1
    :param v2: coordinates of vector v2
    :param deg: whether to return degrees or radians
    :returns : angle in degree or rad
    """
    if np.array_equal(v1, v2):
        return 0.0
    dm = np.dot(v1, v2)
    cm = np.linalg.norm(v1) * np.linalg.norm(v2)
    angle = np.arccos(dm / cm)  # Round here to prevent floating point errors
    return np.degrees([angle,])[0] if deg else angle


def normalize_vector(v):
    """Take a vector and return the normalized vector
    :param v: a vector v
    :returns : normalized vector v
    """
    norm = np.linalg.norm(v)
    return v / norm if not norm == 0 else v


def centroid(coo):
    """Calculates the centroid from a 3D point cloud and returns the coordinates
    :param coo: Array of coordinate arrays
    :returns : centroid coordinates as list
    """
    return list(
        map(
            np.mean,
            (([c[0] for c in coo]), ([c[1] for c in coo]), ([c[2] for c in coo])),
        )
    )


def projection(pnormal1, ppoint, tpoint):
    """Calculates the centroid from a 3D point cloud and returns the coordinates
    :param pnormal1: normal of plane
    :param ppoint: coordinates of point in the plane
    :param tpoint: coordinates of point to be projected
    :returns : coordinates of point orthogonally projected on the plane
    """
    # Choose the plane normal pointing to the point to be projected
    pnormal2 = [coo * (-1) for coo in pnormal1]
    d1 = euclidean3d(tpoint, pnormal1 + ppoint)
    d2 = euclidean3d(tpoint, pnormal2 + ppoint)
    pnormal = pnormal1 if d1 < d2 else pnormal2
    # Calculate the projection of tpoint to the plane
    sn = -np.dot(pnormal, vector(ppoint, tpoint))
    sd = np.dot(pnormal, pnormal)
    sb = sn / sd
    return [c1 + c2 for c1, c2 in zip(tpoint, [sb * pn for pn in pnormal])]


def cluster_doubles(double_list):
    """Given a list of doubles, they are clustered if they share one element
    :param double_list: list of doubles
    :returns : list of clusters (tuples)
    """
    location = {}  # hashtable of which cluster each element is in
    clusters = []
    # Go through each double
    for t in double_list:
        a, b = t[0], t[1]
        # If they both are already in different clusters, merge the clusters
        if a in location and b in location:
            if location[a] != location[b]:
                if location[a] < location[b]:
                    clusters[location[a]] = clusters[location[a]].union(
                        clusters[location[b]]
                    )  # Merge clusters
                    clusters = clusters[: location[b]] + clusters[location[b] + 1 :]
                else:
                    clusters[location[b]] = clusters[location[b]].union(
                        clusters[location[a]]
                    )  # Merge clusters
                    clusters = clusters[: location[a]] + clusters[location[a] + 1 :]
                # Rebuild index of locations for each element as they have changed now
                location = {}
                for i, cluster in enumerate(clusters):
                    for c in cluster:
                        location[c] = i
        else:
            # If a is already in a cluster, add b to that cluster
            if a in location:
                clusters[location[a]].add(b)
                location[b] = location[a]
            # If b is already in a cluster, add a to that cluster
            if b in location:
                clusters[location[b]].add(a)
                location[a] = location[b]
            # If neither a nor b is in any cluster, create a new one with a and b
            if not (b in location and a in location):
                clusters.append(set(t))
                location[a] = len(clusters) - 1
                location[b] = len(clusters) - 1
    return map(tuple, clusters)


#################
# File operations
#################


def tilde_expansion(folder_path):
    """Tilde expansion, i.e. converts '~' in paths into <value of $HOME>."""
    return os.path.expanduser(folder_path) if "~" in folder_path else folder_path


def folder_exists(folder_path):
    """Checks if a folder exists"""
    return os.path.exists(folder_path)


def create_folder_if_not_exists(folder_path):
    """Creates a folder if it does not exists."""
    folder_path = tilde_expansion(folder_path)
    folder_path = (
        "".join([folder_path, "/"]) if not folder_path[-1] == "/" else folder_path
    )
    direc = os.path.dirname(folder_path)
    if not folder_exists(direc):
        os.makedirs(direc)


def cmd_exists(c):
    return (
        subprocess.call(
            "type " + c, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        == 0
    )


################
# PyMOL-specific
################


def initialize_pymol(options):
    """Initializes PyMOL"""
    import pymol

    # Pass standard arguments of function to prevent PyMOL from printing out PDB headers (workaround)
    pymol.finish_launching(args=["pymol", options, "-K"])
    pymol.cmd.reinitialize()


def start_pymol(quiet=False, options="-p", run=False):
    """Starts up PyMOL and sets general options. Quiet mode suppresses all PyMOL output.
    Command line options can be passed as the second argument."""
    import pymol

    pymol.pymol_argv = ["pymol", "%s" % options] + sys.argv[1:]
    if run:
        initialize_pymol(options)
    if quiet:
        pymol.cmd.feedback("disable", "all", "everything")


def nucleotide_linkage(residues):
    """Support for DNA/RNA ligands by finding missing covalent linkages to stitch DNA/RNA together."""

    nuc_covalent = []
    #######################################
    # Basic support for RNA/DNA as ligand #
    #######################################
    nucleotides = ["A", "C", "T", "G", "U", "DA", "DC", "DT", "DG", "DU"]
    dna_rna = {}  # Dictionary of DNA/RNA residues by chain
    covlinkage = namedtuple("covlinkage", "id1 chain1 pos1 conf1 id2 chain2 pos2 conf2")
    # Create missing covlinkage entries for DNA/RNA
    for ligand in residues:
        resname, chain, pos = ligand
        if resname in nucleotides:
            if chain not in dna_rna:
                dna_rna[chain] = [
                    (resname, pos),
                ]
            else:
                dna_rna[chain].append((resname, pos))
    for chain in dna_rna:
        nuc_list = dna_rna[chain]
        for i, nucleotide in enumerate(nuc_list):
            if not i == len(nuc_list) - 1:
                name, pos = nucleotide
                nextnucleotide = nuc_list[i + 1]
                nextname, nextpos = nextnucleotide
                newlink = covlinkage(
                    id1=name,
                    chain1=chain,
                    pos1=pos,
                    conf1="",
                    id2=nextname,
                    chain2=chain,
                    pos2=nextpos,
                    conf2="",
                )
                nuc_covalent.append(newlink)

    return nuc_covalent


def ring_is_planar(ring, r_atoms):
    """Given a set of ring atoms, check if the ring is sufficiently planar
    to be considered aromatic"""
    normals = []
    for a in r_atoms:
        adj = pybel.ob.OBAtomAtomIter(a.OBAtom)
        # Check for neighboring atoms in the ring
        n_coords = [pybel.Atom(neigh).coords for neigh in adj if ring.IsMember(neigh)]
        vec1, vec2 = vector(a.coords, n_coords[0]), vector(a.coords, n_coords[1])
        normals.append(np.cross(vec1, vec2))
    # Given all normals of ring atoms and their neighbors, the angle between any has to be 5.0 deg or less
    for n1, n2 in itertools.product(normals, repeat=2):
        arom_angle = vecangle(n1, n2)
        if all(
            [
                arom_angle > config.AROMATIC_PLANARITY,
                arom_angle < 180.0 - config.AROMATIC_PLANARITY,
            ]
        ):
            return False
    return True


def classify_by_name(names):
    """Classify a (composite) ligand by the HETID(s)"""
    if len(names) > 3:  # Polymer
        if len(set(config.RNA).intersection(set(names))) != 0:
            ligtype = "RNA"
        elif len(set(config.DNA).intersection(set(names))) != 0:
            ligtype = "DNA"
        else:
            ligtype = "POLYMER"
    else:
        ligtype = "SMALLMOLECULE"

    for name in names:
        if name in config.METAL_IONS:
            if len(names) == 1:
                ligtype = "ION"
            else:
                if "ION" not in ligtype:
                    ligtype += "+ION"
    return ligtype


def sort_members_by_importance(members):
    """Sort the members of a composite ligand according to two criteria:
    1. Split up in main and ion group. Ion groups are located behind the main group.
    2. Within each group, sort by chain and position."""
    main = [x for x in members if x[0] not in config.METAL_IONS]
    ion = [x for x in members if x[0] in config.METAL_IONS]
    sorted_main = sorted(main, key=lambda x: (x[1], x[2]))
    sorted_main = sorted(main, key=lambda x: (x[1], x[2]))
    sorted_ion = sorted(ion, key=lambda x: (x[1], x[2]))
    return sorted_main + sorted_ion


def get_isomorphisms(reference, lig):
    """Get all isomorphisms of the ligand."""
    query = pybel.ob.CompileMoleculeQuery(reference.OBMol)
    mappr = pybel.ob.OBIsomorphismMapper.GetInstance(query)
    if all:
        isomorphs = pybel.ob.vvpairUIntUInt()
        mappr.MapAll(lig.OBMol, isomorphs)
    else:
        isomorphs = pybel.ob.vpairUIntUInt()
        mappr.MapFirst(lig.OBMol, isomorphs)
        isomorphs = [isomorphs]
    logger.debug(f"number of isomorphisms: {len(isomorphs)}")
    # @todo Check which isomorphism to take
    return isomorphs


def canonicalize(lig, preserve_bond_order=False):
    """Get the canonical atom order for the ligand."""
    atomorder = None
    # Get canonical atom order

    lig = pybel.ob.OBMol(lig.OBMol)
    if not preserve_bond_order:
        for bond in pybel.ob.OBMolBondIter(lig):
            if bond.GetBondOrder() != 1:
                bond.SetBondOrder(1)
    lig.DeleteData(pybel.ob.StereoData)
    lig = pybel.Molecule(lig)
    testcan = lig.write(format="can")
    try:
        pybel.readstring("can", testcan)
        reference = pybel.readstring("can", testcan)
    except IOError:
        testcan, reference = "", ""
    if testcan != "":
        reference.removeh()
        isomorphs = get_isomorphisms(
            reference, lig
        )  # isomorphs now holds all isomorphisms within the molecule
        if not len(isomorphs) == 0:
            smi_dict = {}
            smi_to_can = isomorphs[0]
            for x in smi_to_can:
                smi_dict[int(x[1]) + 1] = int(x[0]) + 1
            atomorder = [smi_dict[x + 1] for x in range(len(lig.atoms))]
        else:
            atomorder = None
    return atomorder


def int32_to_negative(int32):
    """Checks if a suspicious number (e.g. ligand position) is in fact a negative number represented as a
    32 bit integer and returns the actual number.
    """
    dct = {}
    if (
        int32 == 4294967295
    ):  # Special case in some structures (note, this is just a workaround)
        return -1
    for i in range(-1000, -1):
        dct[np.uint32(i)] = i
    if int32 in dct:
        return dct[int32]
    else:
        return int32


def read_pdb(pdbfname, as_string=False):
    """Reads a given PDB file and returns a Pybel Molecule."""
    pybel.ob.obErrorLog.StopLogging()  # Suppress all OpenBabel warnings
    if os.name != "nt":  # Resource module not available for Windows
        maxsize = resource.getrlimit(resource.RLIMIT_STACK)[-1]
        resource.setrlimit(resource.RLIMIT_STACK, (min(2 ** 28, maxsize), maxsize))
    sys.setrecursionlimit(10 ** 5)  # increase Python recursion limit
    return readmol(pdbfname, as_string=as_string)


def read(fil):
    """Returns a file handler and detects gzipped files."""
    if os.path.splitext(fil)[-1] == ".gz":
        return gzip.open(fil, "rb")
    elif os.path.splitext(fil)[-1] == ".zip":
        zf = zipfile.ZipFile(fil, "r")
        return zf.open(zf.infolist()[0].filename)
    else:
        return open(fil, "r")


def readmol(path, as_string=False):
    """Reads the given molecule file and returns the corresponding Pybel molecule as well as the input file type.
    In contrast to the standard Pybel implementation, the file is closed properly."""
    supported_formats = ["pdb"]
    # Fix for Windows-generated files: Remove carriage return characters
    if "\r" in path and as_string:
        path = path.replace("\r", "")

    for sformat in supported_formats:
        obc = pybel.ob.OBConversion()
        obc.SetInFormat(sformat)
        logger.debug(
            f"detected {sformat} as format, trying to read file with OpenBabel"
        )

        # Read molecules with single bond information
        if as_string:
            try:
                mymol = pybel.readstring(sformat, path)
            except IOError:
                logger.error("no valid file format provided")
                sys.exit(1)
        else:
            read_file = pybel.readfile(format=sformat, filename=path, opt={"s": None})
            try:
                mymol = next(read_file)
            except StopIteration:
                logger.error("file contains no valid molecules")
                sys.exit(1)

        logger.debug("molecule successfully read")

        # Assign multiple bonds
        mymol.OBMol.PerceiveBondOrders()
        return mymol, sformat

    logger.error("no valid file format provided")
    sys.exit(1)