1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
|
import gzip
import itertools
import os
import platform
import re
import subprocess
import sys
import tempfile
import zipfile
from collections import namedtuple
import numpy as np
from openbabel import pybel
from openbabel.pybel import Atom
from plip.basic import config, logger
logger = logger.get_logger()
# Windows and MacOS
if (
os.name != "nt" and platform.system() != "Darwin"
): # Resource module not available for Windows
import resource
# Settings
np.seterr(all="ignore") # No runtime warnings
def tmpfile(prefix, direc):
"""Returns the path to a newly created temporary file."""
return tempfile.mktemp(prefix=prefix, suffix=".pdb", dir=direc)
def is_lig(hetid):
"""Checks if a PDB compound can be excluded as a small molecule ligand"""
h = hetid.upper()
return not (h == "HOH" or h in config.UNSUPPORTED)
def extract_pdbid(string):
"""Use regular expressions to get a PDB ID from a string"""
p = re.compile("[0-9][0-9a-z]{3}")
m = p.search(string.lower())
try:
return m.group()
except AttributeError:
return "UnknownProtein"
def whichrestype(atom):
"""Returns the residue name of an Pybel or OpenBabel atom."""
atom = (
atom if not isinstance(atom, Atom) else atom.OBAtom
) # Convert to OpenBabel Atom
return atom.GetResidue().GetName() if atom.GetResidue() is not None else None
def whichresnumber(atom):
"""Returns the residue number of an Pybel or OpenBabel atom (numbering as in original PDB file)."""
atom = (
atom if not isinstance(atom, Atom) else atom.OBAtom
) # Convert to OpenBabel Atom
return atom.GetResidue().GetNum() if atom.GetResidue() is not None else None
def whichchain(atom):
"""Returns the residue number of an PyBel or OpenBabel atom."""
atom = (
atom if not isinstance(atom, Atom) else atom.OBAtom
) # Convert to OpenBabel Atom
return atom.GetResidue().GetChain() if atom.GetResidue() is not None else None
#########################
# Mathematical operations
#########################
def euclidean3d(v1, v2):
"""Faster implementation of euclidean distance for the 3D case."""
if not len(v1) == 3 and len(v2) == 3:
return None
return np.sqrt((v1[0] - v2[0]) ** 2 + (v1[1] - v2[1]) ** 2 + (v1[2] - v2[2]) ** 2)
def vector(p1, p2):
"""Vector from p1 to p2.
:param p1: coordinates of point p1
:param p2: coordinates of point p2
:returns : numpy array with vector coordinates
"""
return (
None
if len(p1) != len(p2)
else np.array([p2[i] - p1[i] for i in range(len(p1))])
)
def vecangle(v1, v2, deg=True):
"""Calculate the angle between two vectors
:param v1: coordinates of vector v1
:param v2: coordinates of vector v2
:param deg: whether to return degrees or radians
:returns : angle in degree or rad
"""
if np.array_equal(v1, v2):
return 0.0
dm = np.dot(v1, v2)
cm = np.linalg.norm(v1) * np.linalg.norm(v2)
angle = np.arccos(dm / cm) # Round here to prevent floating point errors
return np.degrees([angle,])[0] if deg else angle
def normalize_vector(v):
"""Take a vector and return the normalized vector
:param v: a vector v
:returns : normalized vector v
"""
norm = np.linalg.norm(v)
return v / norm if not norm == 0 else v
def centroid(coo):
"""Calculates the centroid from a 3D point cloud and returns the coordinates
:param coo: Array of coordinate arrays
:returns : centroid coordinates as list
"""
return list(
map(
np.mean,
(([c[0] for c in coo]), ([c[1] for c in coo]), ([c[2] for c in coo])),
)
)
def projection(pnormal1, ppoint, tpoint):
"""Calculates the centroid from a 3D point cloud and returns the coordinates
:param pnormal1: normal of plane
:param ppoint: coordinates of point in the plane
:param tpoint: coordinates of point to be projected
:returns : coordinates of point orthogonally projected on the plane
"""
# Choose the plane normal pointing to the point to be projected
pnormal2 = [coo * (-1) for coo in pnormal1]
d1 = euclidean3d(tpoint, pnormal1 + ppoint)
d2 = euclidean3d(tpoint, pnormal2 + ppoint)
pnormal = pnormal1 if d1 < d2 else pnormal2
# Calculate the projection of tpoint to the plane
sn = -np.dot(pnormal, vector(ppoint, tpoint))
sd = np.dot(pnormal, pnormal)
sb = sn / sd
return [c1 + c2 for c1, c2 in zip(tpoint, [sb * pn for pn in pnormal])]
def cluster_doubles(double_list):
"""Given a list of doubles, they are clustered if they share one element
:param double_list: list of doubles
:returns : list of clusters (tuples)
"""
location = {} # hashtable of which cluster each element is in
clusters = []
# Go through each double
for t in double_list:
a, b = t[0], t[1]
# If they both are already in different clusters, merge the clusters
if a in location and b in location:
if location[a] != location[b]:
if location[a] < location[b]:
clusters[location[a]] = clusters[location[a]].union(
clusters[location[b]]
) # Merge clusters
clusters = clusters[: location[b]] + clusters[location[b] + 1 :]
else:
clusters[location[b]] = clusters[location[b]].union(
clusters[location[a]]
) # Merge clusters
clusters = clusters[: location[a]] + clusters[location[a] + 1 :]
# Rebuild index of locations for each element as they have changed now
location = {}
for i, cluster in enumerate(clusters):
for c in cluster:
location[c] = i
else:
# If a is already in a cluster, add b to that cluster
if a in location:
clusters[location[a]].add(b)
location[b] = location[a]
# If b is already in a cluster, add a to that cluster
if b in location:
clusters[location[b]].add(a)
location[a] = location[b]
# If neither a nor b is in any cluster, create a new one with a and b
if not (b in location and a in location):
clusters.append(set(t))
location[a] = len(clusters) - 1
location[b] = len(clusters) - 1
return map(tuple, clusters)
#################
# File operations
#################
def tilde_expansion(folder_path):
"""Tilde expansion, i.e. converts '~' in paths into <value of $HOME>."""
return os.path.expanduser(folder_path) if "~" in folder_path else folder_path
def folder_exists(folder_path):
"""Checks if a folder exists"""
return os.path.exists(folder_path)
def create_folder_if_not_exists(folder_path):
"""Creates a folder if it does not exists."""
folder_path = tilde_expansion(folder_path)
folder_path = (
"".join([folder_path, "/"]) if not folder_path[-1] == "/" else folder_path
)
direc = os.path.dirname(folder_path)
if not folder_exists(direc):
os.makedirs(direc)
def cmd_exists(c):
return (
subprocess.call(
"type " + c, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
== 0
)
################
# PyMOL-specific
################
def initialize_pymol(options):
"""Initializes PyMOL"""
import pymol
# Pass standard arguments of function to prevent PyMOL from printing out PDB headers (workaround)
pymol.finish_launching(args=["pymol", options, "-K"])
pymol.cmd.reinitialize()
def start_pymol(quiet=False, options="-p", run=False):
"""Starts up PyMOL and sets general options. Quiet mode suppresses all PyMOL output.
Command line options can be passed as the second argument."""
import pymol
pymol.pymol_argv = ["pymol", "%s" % options] + sys.argv[1:]
if run:
initialize_pymol(options)
if quiet:
pymol.cmd.feedback("disable", "all", "everything")
def nucleotide_linkage(residues):
"""Support for DNA/RNA ligands by finding missing covalent linkages to stitch DNA/RNA together."""
nuc_covalent = []
#######################################
# Basic support for RNA/DNA as ligand #
#######################################
nucleotides = ["A", "C", "T", "G", "U", "DA", "DC", "DT", "DG", "DU"]
dna_rna = {} # Dictionary of DNA/RNA residues by chain
covlinkage = namedtuple("covlinkage", "id1 chain1 pos1 conf1 id2 chain2 pos2 conf2")
# Create missing covlinkage entries for DNA/RNA
for ligand in residues:
resname, chain, pos = ligand
if resname in nucleotides:
if chain not in dna_rna:
dna_rna[chain] = [
(resname, pos),
]
else:
dna_rna[chain].append((resname, pos))
for chain in dna_rna:
nuc_list = dna_rna[chain]
for i, nucleotide in enumerate(nuc_list):
if not i == len(nuc_list) - 1:
name, pos = nucleotide
nextnucleotide = nuc_list[i + 1]
nextname, nextpos = nextnucleotide
newlink = covlinkage(
id1=name,
chain1=chain,
pos1=pos,
conf1="",
id2=nextname,
chain2=chain,
pos2=nextpos,
conf2="",
)
nuc_covalent.append(newlink)
return nuc_covalent
def ring_is_planar(ring, r_atoms):
"""Given a set of ring atoms, check if the ring is sufficiently planar
to be considered aromatic"""
normals = []
for a in r_atoms:
adj = pybel.ob.OBAtomAtomIter(a.OBAtom)
# Check for neighboring atoms in the ring
n_coords = [pybel.Atom(neigh).coords for neigh in adj if ring.IsMember(neigh)]
vec1, vec2 = vector(a.coords, n_coords[0]), vector(a.coords, n_coords[1])
normals.append(np.cross(vec1, vec2))
# Given all normals of ring atoms and their neighbors, the angle between any has to be 5.0 deg or less
for n1, n2 in itertools.product(normals, repeat=2):
arom_angle = vecangle(n1, n2)
if all(
[
arom_angle > config.AROMATIC_PLANARITY,
arom_angle < 180.0 - config.AROMATIC_PLANARITY,
]
):
return False
return True
def classify_by_name(names):
"""Classify a (composite) ligand by the HETID(s)"""
if len(names) > 3: # Polymer
if len(set(config.RNA).intersection(set(names))) != 0:
ligtype = "RNA"
elif len(set(config.DNA).intersection(set(names))) != 0:
ligtype = "DNA"
else:
ligtype = "POLYMER"
else:
ligtype = "SMALLMOLECULE"
for name in names:
if name in config.METAL_IONS:
if len(names) == 1:
ligtype = "ION"
else:
if "ION" not in ligtype:
ligtype += "+ION"
return ligtype
def sort_members_by_importance(members):
"""Sort the members of a composite ligand according to two criteria:
1. Split up in main and ion group. Ion groups are located behind the main group.
2. Within each group, sort by chain and position."""
main = [x for x in members if x[0] not in config.METAL_IONS]
ion = [x for x in members if x[0] in config.METAL_IONS]
sorted_main = sorted(main, key=lambda x: (x[1], x[2]))
sorted_main = sorted(main, key=lambda x: (x[1], x[2]))
sorted_ion = sorted(ion, key=lambda x: (x[1], x[2]))
return sorted_main + sorted_ion
def get_isomorphisms(reference, lig):
"""Get all isomorphisms of the ligand."""
query = pybel.ob.CompileMoleculeQuery(reference.OBMol)
mappr = pybel.ob.OBIsomorphismMapper.GetInstance(query)
if all:
isomorphs = pybel.ob.vvpairUIntUInt()
mappr.MapAll(lig.OBMol, isomorphs)
else:
isomorphs = pybel.ob.vpairUIntUInt()
mappr.MapFirst(lig.OBMol, isomorphs)
isomorphs = [isomorphs]
logger.debug(f"number of isomorphisms: {len(isomorphs)}")
# @todo Check which isomorphism to take
return isomorphs
def canonicalize(lig, preserve_bond_order=False):
"""Get the canonical atom order for the ligand."""
atomorder = None
# Get canonical atom order
lig = pybel.ob.OBMol(lig.OBMol)
if not preserve_bond_order:
for bond in pybel.ob.OBMolBondIter(lig):
if bond.GetBondOrder() != 1:
bond.SetBondOrder(1)
lig.DeleteData(pybel.ob.StereoData)
lig = pybel.Molecule(lig)
testcan = lig.write(format="can")
try:
pybel.readstring("can", testcan)
reference = pybel.readstring("can", testcan)
except IOError:
testcan, reference = "", ""
if testcan != "":
reference.removeh()
isomorphs = get_isomorphisms(
reference, lig
) # isomorphs now holds all isomorphisms within the molecule
if not len(isomorphs) == 0:
smi_dict = {}
smi_to_can = isomorphs[0]
for x in smi_to_can:
smi_dict[int(x[1]) + 1] = int(x[0]) + 1
atomorder = [smi_dict[x + 1] for x in range(len(lig.atoms))]
else:
atomorder = None
return atomorder
def int32_to_negative(int32):
"""Checks if a suspicious number (e.g. ligand position) is in fact a negative number represented as a
32 bit integer and returns the actual number.
"""
dct = {}
if (
int32 == 4294967295
): # Special case in some structures (note, this is just a workaround)
return -1
for i in range(-1000, -1):
dct[np.uint32(i)] = i
if int32 in dct:
return dct[int32]
else:
return int32
def read_pdb(pdbfname, as_string=False):
"""Reads a given PDB file and returns a Pybel Molecule."""
pybel.ob.obErrorLog.StopLogging() # Suppress all OpenBabel warnings
if os.name != "nt": # Resource module not available for Windows
maxsize = resource.getrlimit(resource.RLIMIT_STACK)[-1]
resource.setrlimit(resource.RLIMIT_STACK, (min(2 ** 28, maxsize), maxsize))
sys.setrecursionlimit(10 ** 5) # increase Python recursion limit
return readmol(pdbfname, as_string=as_string)
def read(fil):
"""Returns a file handler and detects gzipped files."""
if os.path.splitext(fil)[-1] == ".gz":
return gzip.open(fil, "rb")
elif os.path.splitext(fil)[-1] == ".zip":
zf = zipfile.ZipFile(fil, "r")
return zf.open(zf.infolist()[0].filename)
else:
return open(fil, "r")
def readmol(path, as_string=False):
"""Reads the given molecule file and returns the corresponding Pybel molecule as well as the input file type.
In contrast to the standard Pybel implementation, the file is closed properly."""
supported_formats = ["pdb"]
# Fix for Windows-generated files: Remove carriage return characters
if "\r" in path and as_string:
path = path.replace("\r", "")
for sformat in supported_formats:
obc = pybel.ob.OBConversion()
obc.SetInFormat(sformat)
logger.debug(
f"detected {sformat} as format, trying to read file with OpenBabel"
)
# Read molecules with single bond information
if as_string:
try:
mymol = pybel.readstring(sformat, path)
except IOError:
logger.error("no valid file format provided")
sys.exit(1)
else:
read_file = pybel.readfile(format=sformat, filename=path, opt={"s": None})
try:
mymol = next(read_file)
except StopIteration:
logger.error("file contains no valid molecules")
sys.exit(1)
logger.debug("molecule successfully read")
# Assign multiple bonds
mymol.OBMol.PerceiveBondOrders()
return mymol, sformat
logger.error("no valid file format provided")
sys.exit(1)
|