aboutsummaryrefslogtreecommitdiff
path: root/plip/structure/preparation.py
blob: c2631d3ee6e8c09f03be24f7deabc570318783b8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
import itertools
import os
import re
import tempfile
from collections import namedtuple
from operator import itemgetter

import numpy as np
from openbabel import pybel

from plip.basic import config, logger
from plip.basic.supplemental import centroid, tilde_expansion, tmpfile, classify_by_name
from plip.basic.supplemental import cluster_doubles, is_lig, normalize_vector, vector, ring_is_planar
from plip.basic.supplemental import extract_pdbid, read_pdb, create_folder_if_not_exists, canonicalize
from plip.basic.supplemental import read, nucleotide_linkage, sort_members_by_importance
from plip.basic.supplemental import whichchain, whichrestype, whichresnumber, euclidean3d, int32_to_negative
from plip.structure.detection import halogen, pication, water_bridges, metal_complexation
from plip.structure.detection import hydrophobic_interactions, pistacking, hbonds, saltbridge

# Rebind the name ``logger`` (imported above as a module from plip.basic) to the object
# returned by its get_logger(); every logger.debug(...) call below goes through that object.
logger = logger.get_logger()


class PDBParser:
    """Parse a PDB structure given as file path or as raw string.

    The input is parsed once on construction (see :meth:`parse_pdb`). The results are
    stored in ``proteinmap``, ``modres``, ``covalent``, ``altconformations`` and
    ``corrected_pdb``. ``num_fixed_lines`` counts the lines that :meth:`fix_pdbline`
    repaired or dropped.
    """

    # PDBQT atom types found at the end of a line and the element written instead
    _PDBQT_CONVERSION = {
        "HD": "H", "HS": "H", "NA": "N",
        "NS": "N", "OA": "O", "OS": "O", "SA": "S"}
    # Pattern for characters that are not accepted in residue / ligand names
    _FORBIDDEN_CHARACTERS = "[^a-zA-Z0-9_]"

    def __init__(self, pdbpath, as_string):
        """Store the input and parse it immediately.

        :param pdbpath: path to a PDB file, or the PDB content itself if ``as_string`` is true
        :param as_string: whether ``pdbpath`` holds the file content instead of a path
        """
        self.as_string = as_string
        self.pdbpath = pdbpath
        self.num_fixed_lines = 0
        self.covlinkage = namedtuple("covlinkage", "id1 chain1 pos1 conf1 id2 chain2 pos2 conf2")
        self.proteinmap, self.modres, self.covalent, self.altconformations, self.corrected_pdb = self.parse_pdb()

    def parse_pdb(self):
        """Extracts additional information from PDB files.
        I. When reading in a PDB file, OpenBabel numbers ATOMS and HETATOMS continuously.
        In PDB files, TER records are also counted, leading to a different numbering system.
        This function reads in a PDB file and provides a mapping as a dictionary.
        II. Additionally, it returns a set of modified residues.
        III. Furthermore, covalent linkages between ligands and protein residues/other ligands are identified
        IV. Alternative conformations

        :return: tuple ``(mapping, modres, covalent, alt, corrected_pdb)`` where ``mapping`` maps the
                 running ATOM/HETATM index to the numbering that also counts TER records, ``modres`` is
                 the set of names read from MODRES records, ``covalent`` the list of LINK entries,
                 ``alt`` the atom ids whose alternate location is neither blank nor 'A', and
                 ``corrected_pdb`` the fixed file content (or the untouched input if fixing is disabled)
        """
        if self.as_string:
            fil = self.pdbpath.rstrip('\n').split('\n')  # Removing trailing newline character
        else:
            f = read(self.pdbpath)
            try:
                fil = f.readlines()
            finally:
                # Release the handle even if reading fails
                f.close()

        if config.NOFIX or config.PLUGIN_MODE:
            # Fixing is disabled: hand the input through unchanged
            corrected_pdb = self.pdbpath
            corrected_lines = fil
        else:
            corrected_lines = self._fix_lines(fil)
            corrected_pdb = ''.join(corrected_lines)

        idx, pdb_num = 0, 0  # running atom index and PDB numbering (which also counts TER)
        mapping = {}
        modres = set()
        covalent = []
        alt = []
        previous_ter = False
        for line in corrected_lines:
            if line.startswith(("ATOM", "HETATM")):
                # Retrieve alternate conformations (blank is treated like 'A')
                atomid, location = int(line[6:11]), line[16]
                if location not in (' ', 'A'):
                    alt.append(atomid)
                # A preceding TER record consumed one number of its own
                idx += 1
                pdb_num += 2 if previous_ter else 1
                mapping[idx] = pdb_num
                previous_ter = False
            elif line.startswith("TER"):
                # Numbering changes at TER records
                previous_ter = True
            elif line.startswith("MODRES"):
                # Get modified residues
                modres.add(line[12:15].strip())
            elif line.startswith("LINK"):
                # Get covalent linkages between ligands
                covalent.append(self.get_linkage(line))
        return mapping, modres, covalent, alt, corrected_pdb

    def _fix_lines(self, lines):
        """Run fix_pdbline over the lines, stopping after the first MODEL record numbered above 1."""
        corrected_lines = []
        lastnum = 0  # Atom numbering (has to be consecutive)
        for line in lines:
            corrected_line, newnum = self.fix_pdbline(line, lastnum)
            if corrected_line is None:
                continue
            other_models = False
            if corrected_line.startswith('MODEL'):
                try:  # Get number of MODEL (1,2,3)
                    other_models = int(corrected_line[10:14]) > 1
                except ValueError:
                    logger.debug(f'ignoring invalid MODEL entry: {corrected_line}')
            corrected_lines.append(corrected_line)
            lastnum = newnum
            if other_models:
                # Only consider the first model; the MODEL line itself is still kept
                break
        return corrected_lines

    @staticmethod
    def _renumber(pdbline, new_num):
        """Return the line with the serial number field (index 6-10) replaced by new_num."""
        return pdbline[:6] + str(new_num).rjust(5) + ' ' + pdbline[12:]

    def _convert_pdbqt_type(self, pdbline):
        """Replace a trailing PDBQT atom type by its element; count the line as fixed if replaced."""
        stripped = pdbline.strip()
        for pdbqttype, element in self._PDBQT_CONVERSION.items():
            if stripped.endswith(pdbqttype):
                self.num_fixed_lines += 1
                # No newline here: fix_pdbline appends exactly one line terminator itself
                return stripped[:-2] + ' ' + element + ' '
        return pdbline

    def fix_pdbline(self, pdbline, lastnum):
        """Fix a PDB line if information is missing.

        :param pdbline: one line of the PDB input, with or without trailing newline
        :param lastnum: serial number assigned to the previous ATOM/HETATM/TER record
        :return: tuple ``(line, num)``; ``line`` is the (possibly repaired) line terminated by a
                 single newline, or None if the line is dropped (blank line, line longer than
                 100 characters, ATOM/HETATM line ending in 'H'); ``num`` is the serial number
                 to continue from
        """
        fixed = False
        new_num = 0
        pdbline = pdbline.strip('\n')
        # Some MD / Docking tools produce empty lines, leading to segfaults
        if len(pdbline.strip()) == 0:
            self.num_fixed_lines += 1
            return None, lastnum
        if len(pdbline) > 100:  # Should be 80 long
            self.num_fixed_lines += 1
            return None, lastnum
        # TER Entries also have continuing numbering, consider them as well
        if pdbline.startswith('TER'):
            new_num = lastnum + 1
        if pdbline.startswith('ATOM'):
            new_num = lastnum + 1
            try:
                current_num = int(pdbline[6:11])
            except ValueError:
                # Same tolerance as for HETATM: an unreadable serial number is simply renumbered
                current_num = None
                logger.debug(f'invalid ATOM entry: {pdbline}')
            resnum = pdbline[22:27].strip()
            resname = pdbline[17:21].strip()
            # Invalid residue number (the slice spans index 22-26, so any non-numeric
            # character in that range triggers the replacement)
            try:
                int(resnum)
            except ValueError:
                pdbline = pdbline[:22] + '   0 ' + pdbline[27:]
                fixed = True
            # Invalid characters in residue name
            # NOTE(review): re.match only tests the first character of the name - confirm
            # whether re.search was intended
            if re.match(self._FORBIDDEN_CHARACTERS, resname.strip()):
                pdbline = pdbline[:17] + 'UNK ' + pdbline[21:]
                fixed = True
            if lastnum + 1 != current_num:
                pdbline = self._renumber(pdbline, new_num)
                fixed = True
            # No chain assigned
            if pdbline[21] == ' ':
                pdbline = pdbline[:21] + 'A' + pdbline[22:]
                fixed = True
            if pdbline.endswith('H'):
                self.num_fixed_lines += 1
                return None, lastnum
            # Sometimes, converted PDB structures contain PDBQT atom types. Fix that.
            pdbline = self._convert_pdbqt_type(pdbline)
        if pdbline.startswith('HETATM'):
            new_num = lastnum + 1
            try:
                current_num = int(pdbline[6:11])
            except ValueError:
                current_num = None
                logger.debug(f'invalid HETATM entry: {pdbline}')
            if lastnum + 1 != current_num:
                pdbline = self._renumber(pdbline, new_num)
                fixed = True
            # No chain assigned or number assigned as chain
            if pdbline[21] == ' ':
                pdbline = pdbline[:21] + 'Z' + pdbline[22:]
                fixed = True
            # No residue number assigned
            if pdbline[23:26] == '   ':
                pdbline = pdbline[:23] + '999' + pdbline[26:]
                fixed = True
            # Non-standard Ligand Names
            ligname = pdbline[17:21].strip()
            if len(ligname) > 3:
                pdbline = pdbline[:17] + ligname[:3] + ' ' + pdbline[21:]
                fixed = True
            if re.match(self._FORBIDDEN_CHARACTERS, ligname.strip()):
                pdbline = pdbline[:17] + 'LIG ' + pdbline[21:]
                fixed = True
            if len(ligname.strip()) == 0:
                pdbline = pdbline[:17] + 'LIG ' + pdbline[21:]
                fixed = True
            if pdbline.endswith('H'):
                self.num_fixed_lines += 1
                return None, lastnum
            # Sometimes, converted PDB structures contain PDBQT atom types. Fix that.
            pdbline = self._convert_pdbqt_type(pdbline)
        self.num_fixed_lines += 1 if fixed else 0
        return pdbline + '\n', max(new_num, lastnum)

    def get_linkage(self, line):
        """Get the linkage information from a LINK entry PDB line.

        :param line: a line starting with LINK
        :return: ``covlinkage`` namedtuple with residue name, chain, residue number and
                 alternate location of both partners
        :raises ValueError: if one of the residue number fields is not an integer
        """
        first = dict(conf1=line[16].strip(), id1=line[17:20].strip(),
                     chain1=line[21].strip(), pos1=int(line[22:26]))
        second = dict(conf2=line[46].strip(), id2=line[47:50].strip(),
                      chain2=line[51].strip(), pos2=int(line[52:56]))
        return self.covlinkage(**first, **second)


class LigandFinder:
    """Find the ligands of a protein complex and extract each as a separate molecule.

    The search runs on construction (see :meth:`getligs`). Afterwards ``ligands`` holds the
    extracted ligands, ``water`` the water residues, ``lignames_all`` / ``lignames_kept`` the
    residue names before and after filtering, and ``excluded`` the sorted names that were
    filtered out.
    """

    def __init__(self, proteincomplex, altconf, modres, covalent, mapper):
        """Store the inputs and extract all ligands.

        :param proteincomplex: pybel molecule of the whole complex (its ``OBMol`` is iterated)
        :param altconf: collection of atom ids belonging to alternate conformations
        :param modres: collection of residue names registered as modified residues
        :param covalent: list of covalent linkage records (extended in place by getligs)
        :param mapper: object providing ``mapid`` and a ``ligandmaps`` dictionary
        """
        self.lignames_all = None
        self.lignames_kept = None
        self.water = None
        self.proteincomplex = proteincomplex
        self.altconformations = altconf
        self.modresidues = modres
        self.covalent = covalent
        self.mapper = mapper
        self.ligands = self.getligs()
        self.excluded = sorted(list(self.lignames_all.difference(set(self.lignames_kept))))

    def getpeptides(self, chain):
        """If peptide ligand chains are defined via the command line options,
        try to extract the underlying ligand formed by all residues in the
        given chain without water

        :param chain: chain identifier to extract
        :return: the extracted ligand, or None if the chain has no (non-water) residues
        """
        all_from_chain = [o for o in pybel.ob.OBResidueIter(
            self.proteincomplex.OBMol) if o.GetChain() == chain]  # All residues from chain
        non_water = [o for o in all_from_chain if not o.GetResidueProperty(9)]
        if not non_water:
            # Nothing to extract (unknown chain or water only); extract_ligand needs a member
            return None
        return self.extract_ligand(non_water)

    def getligs(self):
        """Get all ligands from a PDB file and prepare them for analysis.
        Returns all non-empty ligands.
        """

        if config.PEPTIDES == [] and config.INTRA is None:
            # Extract small molecule ligands (default)
            ligands = []

            # Filter for ligands using lists
            ligand_residues, self.lignames_all, self.water = self.filter_for_ligands()

            all_res_dict = {(a.GetName(), a.GetChain(), a.GetNum()): a for a in ligand_residues}
            self.lignames_kept = list(set([a.GetName() for a in ligand_residues]))

            if not config.BREAKCOMPOSITE:
                #  Update register of covalent links with those between DNA/RNA subunits
                self.covalent += nucleotide_linkage(all_res_dict)
                #  Find fragment linked by covalent bonds
                res_kmers = self.identify_kmers(all_res_dict)
            else:
                res_kmers = [[a, ] for a in ligand_residues]
            logger.debug(f'{len(res_kmers)} ligand kmer(s) detected for closer inspection')
            for kmer in res_kmers:  # iterate over all ligands and extract molecules + information
                if len(kmer) > config.MAX_COMPOSITE_LENGTH:
                    logger.debug(
                        f'ligand kmer(s) filtered out with a length of {len(kmer)} fragments ({config.MAX_COMPOSITE_LENGTH} allowed)')
                else:
                    ligands.append(self.extract_ligand(kmer))

        else:
            # Extract peptides from given chains
            self.water = [o for o in pybel.ob.OBResidueIter(self.proteincomplex.OBMol) if o.GetResidueProperty(9)]
            # Default keeps the name bound if neither branch below applies
            peptide_ligands = []
            if config.PEPTIDES:
                peptide_ligands = [self.getpeptides(chain) for chain in config.PEPTIDES]
            elif config.INTRA is not None:
                peptide_ligands = [self.getpeptides(config.INTRA), ]

            ligands = [p for p in peptide_ligands if p is not None]
            self.covalent, self.lignames_kept, self.lignames_all = [], [], set()

        return [lig for lig in ligands if len(lig.mol.atoms) != 0]

    def extract_ligand(self, kmer):
        """Extract the ligand by copying atoms and bonds and assign all information necessary for later steps.

        :param kmer: non-empty list of OBResidue objects forming one ligand
        :return: ``ligand`` namedtuple (mol, hetid, chain, position, water, members, longname,
                 type, atomorder, can_to_pdb)
        """
        data = namedtuple('ligand', 'mol hetid chain position water members longname type atomorder can_to_pdb')
        members = [(res.GetName(), res.GetChain(), int32_to_negative(res.GetNum())) for res in kmer]
        members = sort_members_by_importance(members)
        rname, rchain, rnum = members[0]
        logger.debug(f'finalizing extraction for ligand {rname}:{rchain}:{rnum} with {len(kmer)} elements')
        names = [x[0] for x in members]
        longname = '-'.join(names)

        if config.PEPTIDES:
            ligtype = 'PEPTIDE'
        elif config.INTRA is not None:
            ligtype = 'INTRA'
        else:
            # Classify a ligand by its HETID(s)
            ligtype = classify_by_name(names)
        logger.debug(f'ligand classified as {ligtype}')

        hetatoms = dict()
        for obresidue in kmer:
            # Hydrogens (atomic number 1) are not copied
            cur_hetatoms = {obatom.GetIdx(): obatom for obatom in pybel.ob.OBResidueAtomIter(obresidue) if
                            obatom.GetAtomicNum() != 1}
            if not config.ALTLOC:
                # Remove alternative conformations (standard -> True).
                # The atoms of the *current* residue are checked; checking the already
                # collected atoms would never drop anything for the first residue and
                # would fail on later ones.
                cur_hetatoms = {atom_id: obatom for atom_id, obatom in cur_hetatoms.items()
                                if self.mapper.mapid(atom_id, mtype='protein', to='internal')
                                not in self.altconformations}
            hetatoms.update(cur_hetatoms)
        logger.debug(f'hetero atoms determined (n={len(hetatoms)})')

        lig = pybel.ob.OBMol()  # new ligand mol
        neighbours = dict()
        kept_ids = set(hetatoms.keys())
        for obatom in hetatoms.values():  # iterate over atom objects
            idx = obatom.GetIdx()
            lig.AddAtom(obatom)
            # ids of all neighbours of obatom that are part of the ligand
            neighbours[idx] = set([neighbour_atom.GetIdx() for neighbour_atom
                                   in pybel.ob.OBAtomAtomIter(obatom)]) & kept_ids
        logger.debug('atom neighbours mapped')

        ##############################################################
        # map the old atom idx of OBMol to the new idx of the ligand #
        ##############################################################

        newidx = dict(zip(hetatoms.keys(), [obatom.GetIdx() for obatom in pybel.ob.OBMolAtomIter(lig)]))
        mapold = dict(zip(newidx.values(), newidx))
        # copy the bonds
        for obatom in hetatoms:
            for neighbour_atom in neighbours[obatom]:
                bond = hetatoms[obatom].GetBond(hetatoms[neighbour_atom])
                lig.AddBond(newidx[obatom], newidx[neighbour_atom], bond.GetBondOrder())
        lig = pybel.Molecule(lig)

        # For kmers, the representative ids are chosen (first residue of kmer)
        lig.data.update({'Name': rname, 'Chain': rchain, 'ResNr': rnum})

        # Check if a negative residue number is represented as a 32 bit integer
        if rnum > 10 ** 5:
            rnum = int32_to_negative(rnum)

        lig.title = ':'.join((rname, rchain, str(rnum)))
        self.mapper.ligandmaps[lig.title] = mapold

        logger.debug('renumerated molecule generated')

        if not config.NOPDBCANMAP:
            atomorder = canonicalize(lig)
        else:
            atomorder = None

        can_to_pdb = {}
        if atomorder is not None:
            can_to_pdb = {atomorder[key - 1]: mapold[key] for key in mapold}

        ligand = data(mol=lig, hetid=rname, chain=rchain, position=rnum, water=self.water,
                      members=members, longname=longname, type=ligtype, atomorder=atomorder,
                      can_to_pdb=can_to_pdb)
        return ligand

    def is_het_residue(self, obres):
        """Given an OBResidue, determines if the residue is indeed a possible ligand
        in the PDB file"""
        if not obres.GetResidueProperty(0):
            # If the residue is NOT amino (0)
            # It can be amino_nucleo, coenzme, ion, nucleo, protein, purine, pyrimidine, solvent
            # In these cases, it is a ligand candidate
            return True
        # Here, the residue is classified as amino
        # Amino acids can still be ligands, so we check for HETATM entries
        # Only residues with at least one HETATM entry are processed as ligands
        return any(obres.IsHetAtom(atm) for atm in pybel.ob.OBResidueAtomIter(obres))

    def filter_for_ligands(self):
        """Given an OpenBabel Molecule, get all ligands, their names, and water

        :return: tuple ``(selected_ligands, all_lignames, water)``
        """

        candidates1 = [o for o in pybel.ob.OBResidueIter(
            self.proteincomplex.OBMol) if not o.GetResidueProperty(9) and self.is_het_residue(o)]

        if config.DNARECEPTOR:  # If DNA is the receptor, don't consider DNA as a ligand
            candidates1 = [res for res in candidates1 if res.GetName() not in config.DNA + config.RNA]
        all_lignames = set([a.GetName() for a in candidates1])

        water = [o for o in pybel.ob.OBResidueIter(self.proteincomplex.OBMol) if o.GetResidueProperty(9)]
        # Filter out non-ligands
        if not config.KEEPMOD:  # Keep modified residues as ligands
            candidates2 = [a for a in candidates1 if is_lig(a.GetName()) and a.GetName() not in self.modresidues]
        else:
            candidates2 = [a for a in candidates1 if is_lig(a.GetName())]
        logger.debug(f'{len(candidates2)} ligand(s) after first filtering step')

        ############################################
        # Filtering by counting and artifacts list #
        ############################################
        # Count every name once instead of rescanning the candidates per unique name
        name_counts = {}
        for candidate in candidates2:
            name = candidate.GetName()
            name_counts[name] = name_counts.get(name, 0) + 1
        # Discard if appearing 15 times or more and is possible artifact
        artifacts = {name for name, count in name_counts.items()
                     if name in config.biolip_list and count >= 15}

        selected_ligands = [a for a in candidates2 if a.GetName() not in artifacts]

        return selected_ligands, all_lignames, water

    def identify_kmers(self, residues):
        """Using the covalent linkage information, find out which fragments/subunits form a ligand.

        :param residues: dictionary mapping ``(name, chain, number)`` to the residue object
        :return: list of lists of residue objects; linked residues share a list, all
                 remaining residues get a list of their own
        """

        # Remove all those not considered by ligands and pairings including alternate conformations
        ligdoubles = [[(c.id1, c.chain1, c.pos1), (c.id2, c.chain2, c.pos2)]
                      for c in self.covalent
                      if c.id1 in self.lignames_kept and c.id2 in self.lignames_kept
                      and c.conf1 in ['A', ''] and c.conf2 in ['A', '']
                      and (c.id1, c.chain1, c.pos1) in residues
                      and (c.id2, c.chain2, c.pos2) in residues]
        kmers = cluster_doubles(ligdoubles)
        if not kmers:  # No ligand kmers, just normal independent ligands
            return [[residues[res]] for res in residues]

        # res_kmers contains clusters of covalently bound ligand residues (kmer ligands)
        res_kmers = [[residues[res] for res in kmer] for kmer in kmers]

        # In this case, add other ligands which are not part of a kmer
        in_kmer = {(res.GetName(), res.GetChain(), res.GetNum())
                   for res_kmer in res_kmers for res in res_kmer}
        res_kmers.extend([residues[res]] for res in residues if res not in in_kmer)
        return res_kmers


class Mapper:
    """Translates atom IDs between the different numbering schemes in use."""

    def __init__(self):
        # Internal atom IDs of protein residues -> original PDB atom IDs
        self.proteinmap = None
        # Per ligand (keyed by binding site id): IDs of the new ligand molecule -> internal IDs
        self.ligandmaps = {}
        self.original_structure = None
        # NOTE(review): mapid(..., 'reversed') reads self.reversed_proteinmap, which is not
        # created here - presumably it is assigned from outside; confirm against the caller

    def mapid(self, idx, mtype, bsid=None, to='original'):
        """Map an atom ID according to the requested scheme.

        :param idx: atom ID to translate
        :param mtype: 'reversed', 'protein' or 'ligand'
        :param bsid: key into ``ligandmaps`` (only used for mtype 'ligand')
        :param to: 'original' (default) or 'internal' (only used for mtype 'ligand')
        :return: the mapped ID, or None for any other combination of mtype / to
        """
        if mtype == 'reversed':  # uses the reversed protein map
            return self.reversed_proteinmap[idx]
        if mtype == 'protein':
            return self.proteinmap[idx]
        if mtype != 'ligand' or to not in ('internal', 'original'):
            return None
        internal_idx = self.ligandmaps[bsid][idx]
        if to == 'internal':
            return internal_idx
        # Mapping to original IDs goes through the protein map
        return self.proteinmap[internal_idx]

    def id_to_atom(self, idx):
        """Return the atom of the original structure for a given original ID.

        The ID is first translated with the reversed protein map, then the atom at
        that position is fetched and wrapped as a pybel atom.
        """
        internal_idx = self.mapid(idx, 'reversed')
        obatom = self.original_structure.GetAtom(internal_idx)
        return pybel.Atom(obatom)


class Mol:
    """Base container for a molecule with detection routines for interaction-relevant atoms.

    The result attributes (``rings``, ``hydroph_atoms``, ``charged``,
    ``hbond_don_atom_pairs``, ``hbond_acc_atoms``) start as None; this class only
    provides the routines and accessors, it does not fill them itself.
    """

    def __init__(self, altconf, mapper, mtype, bsid):
        """
        :param altconf: collection of atom indices to skip as alternate conformations
        :param mapper: object providing ``mapid``, ``id_to_atom`` and ``ligandmaps``
        :param mtype: mapping type passed on to ``mapper.mapid``
        :param bsid: binding site id passed on to ``mapper.mapid``
        """
        self.altconf = altconf
        self.Mapper = mapper
        self.mtype = mtype
        self.bsid = bsid
        self.rings = None
        self.hydroph_atoms = None
        self.charged = None
        self.hbond_don_atom_pairs = None
        self.hbond_acc_atoms = None

    def hydrophobic_atoms(self, all_atoms):
        """Collect carbon atoms whose direct neighbours are exclusively carbons and/or hydrogens.

        Atoms listed in ``altconf`` are left out. Returns a list of ``hydrophobic`` records.
        """
        data = namedtuple('hydrophobic', 'atom orig_atom orig_idx')
        carbons = []
        for candidate in all_atoms:
            if candidate.atomicnum != 6:
                continue
            neighbour_elements = {natom.GetAtomicNum() for natom in pybel.ob.OBAtomAtomIter(candidate.OBAtom)}
            if neighbour_elements <= {1, 6}:
                carbons.append(candidate)
        selected = []
        for atom in carbons:
            orig_idx = self.Mapper.mapid(atom.idx, mtype=self.mtype, bsid=self.bsid)
            orig_atom = self.Mapper.id_to_atom(orig_idx)
            if atom.idx in self.altconf:
                continue
            selected.append(data(atom=atom, orig_atom=orig_atom, orig_idx=orig_idx))
        return selected

    def find_hba(self, all_atoms):
        """Collect hydrogen bond acceptors, sorted by their original atom index.

        Halogens (F, Cl, Br, I) and atoms listed in ``altconf`` are not reported.
        """
        data = namedtuple('hbondacceptor', 'a a_orig_atom a_orig_idx type')
        halogens = [9, 17, 35, 53]
        acceptors = []
        for atom in all_atoms:
            if not atom.OBAtom.IsHbondAcceptor():
                continue
            if atom.atomicnum in halogens or atom.idx in self.altconf:
                continue
            a_orig_idx = self.Mapper.mapid(atom.idx, mtype=self.mtype, bsid=self.bsid)
            a_orig_atom = self.Mapper.id_to_atom(a_orig_idx)
            acceptors.append(data(a=atom, a_orig_atom=a_orig_atom, a_orig_idx=a_orig_idx, type='regular'))
        acceptors.sort(key=lambda acc: acc.a_orig_idx)
        return acceptors

    def find_hbd(self, all_atoms, hydroph_atoms):
        """Collect donor/hydrogen pairs: 'regular' donors and 'weak' ones (hydrophobic C-H pairings).

        The result is sorted by original donor index, then by the index of the hydrogen.
        """
        data = namedtuple('hbonddonor', 'd d_orig_atom d_orig_idx h type')
        pairs = []
        donors = [a for a in all_atoms if a.OBAtom.IsHbondDonor() and a.idx not in self.altconf]
        for donor in donors:
            donor_hydrogens = [a for a in pybel.ob.OBAtomAtomIter(donor.OBAtom) if a.IsHbondDonorH()]
            for hydrogen in donor_hydrogens:
                d_orig_idx = self.Mapper.mapid(donor.idx, mtype=self.mtype, bsid=self.bsid)
                d_orig_atom = self.Mapper.id_to_atom(d_orig_idx)
                pairs.append(data(d=donor, d_orig_atom=d_orig_atom, d_orig_idx=d_orig_idx,
                                  h=pybel.Atom(hydrogen), type='regular'))
        for carbon in hydroph_atoms:
            attached_hydrogens = [a for a in pybel.ob.OBAtomAtomIter(carbon.atom.OBAtom) if a.GetAtomicNum() == 1]
            for hydrogen in attached_hydrogens:
                d_orig_idx = self.Mapper.mapid(carbon.atom.idx, mtype=self.mtype, bsid=self.bsid)
                d_orig_atom = self.Mapper.id_to_atom(d_orig_idx)
                # NOTE(review): for weak donors ``d`` is the hydrophobic record itself, whereas
                # regular donors store the atom - confirm that consumers expect this
                pairs.append(data(d=carbon, d_orig_atom=d_orig_atom,
                                  d_orig_idx=d_orig_idx, h=pybel.Atom(hydrogen), type='weak'))
        pairs.sort(key=lambda pair: (pair.d_orig_idx, pair.h.idx))
        return pairs

    def find_rings(self, mol, all_atoms):
        """Detect aromatic rings with five or six members.

        A ring is accepted if OpenBabel flags it as aromatic, if it belongs to
        TYR/TRP/HIS/PHE, or if it passes the planarity check.
        """
        data = namedtuple('aromatic_ring', 'atoms orig_atoms atoms_orig_idx normal obj center type')
        aromatic_amino = ['TYR', 'TRP', 'HIS', 'PHE']
        ring_candidates = mol.OBMol.GetSSSR()
        logger.debug(f'number of aromatic ring candidates: {len(ring_candidates)}')
        found = []
        # Ligand rings not detected as aromatic by Babel still get a chance via planarity
        for ring in ring_candidates:
            members = sorted((a for a in all_atoms if ring.IsMember(a.OBAtom)), key=lambda atom: atom.idx)
            if len(members) not in (5, 6):
                continue
            restypes = list({whichrestype(a) for a in members})
            if restypes[0] == 'UNL':
                # re-sort ring atoms for only ligands, because HETATM numbering is not canonical in OpenBabel
                ligand_orig_idx = [self.Mapper.ligandmaps[self.bsid][a.idx] for a in members]
                members = [members[pos] for pos in np.argsort(np.array(ligand_orig_idx))]
            if not (ring.IsAromatic() or restypes[0] in aromatic_amino or ring_is_planar(ring, members)):
                continue
            # ring.GetType() is deliberately not used (caused a segfault with OpenBabel 2.3.2)
            ring_type = '%s-membered' % len(members)
            # Probe atoms for normals, assuming planarity
            first, third, fifth = (members[pos].coords for pos in (0, 2, 4))
            ringv1 = vector(first, third)
            ringv2 = vector(fifth, first)
            atoms_orig_idx = [self.Mapper.mapid(member.idx, mtype=self.mtype, bsid=self.bsid)
                              for member in members]
            orig_atoms = [self.Mapper.id_to_atom(orig_idx) for orig_idx in atoms_orig_idx]
            normal = normalize_vector(np.cross(ringv1, ringv2))
            center = centroid([member.coords for member in members])
            found.append(data(atoms=members, orig_atoms=orig_atoms, atoms_orig_idx=atoms_orig_idx,
                              normal=normal, obj=ring, center=center, type=ring_type))
        return found

    def get_hydrophobic_atoms(self):
        """Return the stored hydrophobic atoms."""
        return self.hydroph_atoms

    def get_hba(self):
        """Return the stored hydrogen bond acceptors."""
        return self.hbond_acc_atoms

    def get_hbd(self):
        """Return the stored donor pairs of type 'regular'."""
        return list(filter(lambda pair: pair.type == 'regular', self.hbond_don_atom_pairs))

    def get_weak_hbd(self):
        """Return the stored donor pairs of type 'weak'."""
        return list(filter(lambda pair: pair.type == 'weak', self.hbond_don_atom_pairs))

    def get_pos_charged(self):
        """Return the stored charged groups of type 'positive'."""
        return list(filter(lambda group: group.type == 'positive', self.charged))

    def get_neg_charged(self):
        """Return the stored charged groups of type 'negative'."""
        return list(filter(lambda group: group.type == 'negative', self.charged))


class PLInteraction:
    """Class to store a ligand, a protein and their interactions.

    All interaction types are detected and refined in ``__init__``; the results are kept as lists on the instance
    (e.g. ``hbonds_ldon``, ``pistacking``, ``water_bridges``) and concatenated in ``all_itypes``.
    """

    def __init__(self, lig_obj, bs_obj, protcomplex):
        """Detect all interactions when initializing.

        :param lig_obj: ligand side; must provide ``members``, ``rings``, ``water``, ``metals``, ``metal_binding``,
            ``halogenbond_don``, ``bsid`` and the ``get_*`` accessors used below
        :param bs_obj: binding site side; must provide ``rings``, ``halogenbond_acc``, ``metal_binding`` and the
            ``get_*`` accessors used below
        :param protcomplex: complex object providing ``pymol_name``, ``Mapper``, ``output_path`` and ``altconf``
        """
        self.ligand = lig_obj
        self.lig_members = lig_obj.members
        self.pdbid = protcomplex.pymol_name
        self.bindingsite = bs_obj
        self.Mapper = protcomplex.Mapper
        self.output_path = protcomplex.output_path
        self.altconf = protcomplex.altconf
        # #@todo Refactor code to combine different directionality

        self.saltbridge_lneg = saltbridge(self.bindingsite.get_pos_charged(), self.ligand.get_neg_charged(), True)
        self.saltbridge_pneg = saltbridge(self.ligand.get_pos_charged(), self.bindingsite.get_neg_charged(), False)

        self.all_hbonds_ldon = hbonds(self.bindingsite.get_hba(),
                                      self.ligand.get_hbd(), False, 'strong')
        self.all_hbonds_pdon = hbonds(self.ligand.get_hba(),
                                      self.bindingsite.get_hbd(), True, 'strong')

        # Hydrogen bonds are refined against the salt bridges, so the salt bridges have to be detected first
        self.hbonds_ldon = self.refine_hbonds_ldon(self.all_hbonds_ldon, self.saltbridge_lneg,
                                                   self.saltbridge_pneg)
        self.hbonds_pdon = self.refine_hbonds_pdon(self.all_hbonds_pdon, self.saltbridge_lneg,
                                                   self.saltbridge_pneg)

        self.pistacking = pistacking(self.bindingsite.rings, self.ligand.rings)

        self.all_pi_cation_laro = pication(self.ligand.rings, self.bindingsite.get_pos_charged(), True)
        self.pication_paro = pication(self.bindingsite.rings, self.ligand.get_pos_charged(), False)

        self.pication_laro = self.refine_pi_cation_laro(self.all_pi_cation_laro, self.pistacking)

        self.all_hydrophobic_contacts = hydrophobic_interactions(self.bindingsite.get_hydrophobic_atoms(),
                                                                 self.ligand.get_hydrophobic_atoms())
        self.hydrophobic_contacts = self.refine_hydrophobic(self.all_hydrophobic_contacts, self.pistacking)
        self.halogen_bonds = halogen(self.bindingsite.halogenbond_acc, self.ligand.halogenbond_don)
        self.water_bridges = water_bridges(self.bindingsite.get_hba(), self.ligand.get_hba(),
                                           self.bindingsite.get_hbd(), self.ligand.get_hbd(),
                                           self.ligand.water)

        # Water bridges are refined against the already refined hydrogen bonds
        self.water_bridges = self.refine_water_bridges(self.water_bridges, self.hbonds_ldon, self.hbonds_pdon)

        self.metal_complexes = metal_complexation(self.ligand.metals, self.ligand.metal_binding,
                                                  self.bindingsite.metal_binding)

        self.all_itypes = self.saltbridge_lneg + self.saltbridge_pneg + self.hbonds_pdon
        self.all_itypes = self.all_itypes + self.hbonds_ldon + self.pistacking + self.pication_laro + self.pication_paro
        self.all_itypes = self.all_itypes + self.hydrophobic_contacts + self.halogen_bonds + self.water_bridges
        self.all_itypes = self.all_itypes + self.metal_complexes

        # True when every record in all_itypes has length 0; for an empty list all() is vacuously True
        self.no_interactions = all(len(i) == 0 for i in self.all_itypes)
        self.unpaired_hba, self.unpaired_hbd, self.unpaired_hal = self.find_unpaired_ligand()
        self.unpaired_hba_orig_idx = [self.Mapper.mapid(atom.idx, mtype='ligand', bsid=self.ligand.bsid)
                                      for atom in self.unpaired_hba]
        self.unpaired_hbd_orig_idx = [self.Mapper.mapid(atom.idx, mtype='ligand', bsid=self.ligand.bsid)
                                      for atom in self.unpaired_hbd]
        self.unpaired_hal_orig_idx = [self.Mapper.mapid(atom.idx, mtype='ligand', bsid=self.ligand.bsid)
                                      for atom in self.unpaired_hal]
        self.num_unpaired_hba, self.num_unpaired_hbd = len(self.unpaired_hba), len(self.unpaired_hbd)
        self.num_unpaired_hal = len(self.unpaired_hal)

        # Exclude empty chains (coming from ligand as a target, from metal complexes)
        self.interacting_chains = sorted({i.reschain for i in self.all_itypes
                                          if i.reschain not in [' ', None]})

        # Get all interacting residues, excluding ligand and water molecules
        self.interacting_res = list({''.join([str(i.resnr), str(i.reschain)]) for i in self.all_itypes
                                     if i.restype not in ['LIG', 'HOH']})
        if self.interacting_res:
            logger.info(
                f'ligand interacts with {len(self.interacting_res)} binding site residue(s) in chain(s) {self.interacting_chains}')
            # Hydrophobic contacts and metal complexes are not part of this summary
            counted = (
                ('%i salt bridge(s)', len(self.saltbridge_lneg + self.saltbridge_pneg)),
                ('%i hydrogen bond(s)', len(self.hbonds_ldon + self.hbonds_pdon)),
                ('%i pi-cation interaction(s)', len(self.pication_laro + self.pication_paro)),
                ('%i pi-stacking(s)', len(self.pistacking)),
                ('%i halogen bond(s)', len(self.halogen_bonds)),
                ('%i water bridge(s)', len(self.water_bridges)),
            )
            interactions_list = [template % count for template, count in counted if count != 0]
            if interactions_list:
                logger.info(f'complex uses {interactions_list}')
        else:
            logger.info('no interactions for this ligand')

    def find_unpaired_ligand(self):
        """Identify unpaired functional groups in the ligand: H-bond acceptors, H-bond donors, halogen bond donors.

        A ligand acceptor/donor counts as paired when its atom index is used by a refined hydrogen bond, a salt
        bridge, a water bridge or a metal complex located on the ligand. Halogen bond donors are additionally
        considered paired when they take part in a halogen bond.

        :return: tuple ``(unpaired_hba, unpaired_hbd, unpaired_hal)`` of atom lists
        """
        # Unpaired hydrogen bond acceptors/donors in ligand (not used for hydrogen bonds/water, salt bridges/mcomplex)
        involved_atoms = {hbond.a.idx for hbond in self.hbonds_pdon}
        involved_atoms.update(hbond.d.idx for hbond in self.hbonds_ldon)
        for sb in self.saltbridge_lneg:
            involved_atoms.update(atom.idx for atom in sb.negative.atoms)
        for sb in self.saltbridge_pneg:
            involved_atoms.update(atom.idx for atom in sb.positive.atoms)
        for wb in self.water_bridges:
            # The ligand atom is the acceptor when the protein is the donor, and the donor otherwise
            involved_atoms.add(wb.a.idx if wb.protisdon else wb.d.idx)
        involved_atoms.update(mcomplex.target.atom.idx for mcomplex in self.metal_complexes
                              if mcomplex.location == 'ligand')

        unpaired_hba = [hba.a for hba in self.ligand.get_hba() if hba.a.idx not in involved_atoms]
        unpaired_hbd = [hbd.d for hbd in self.ligand.get_hbd() if hbd.d.idx not in involved_atoms]

        # unpaired halogen bond donors in ligand (not used for the previous + halogen bonds)
        involved_atoms.update(halbond.don.x.idx for halbond in self.halogen_bonds)
        unpaired_hal = [haldon.x for haldon in self.ligand.halogenbond_don if haldon.x.idx not in involved_atoms]

        return unpaired_hba, unpaired_hbd, unpaired_hal

    def refine_hydrophobic(self, all_h, pistacks):
        """Apply several rules to reduce the number of hydrophobic interactions.

        :param all_h: all detected hydrophobic contacts (records with ``bsatom``, ``ligatom``, ``distance``,
            ``resnr`` and ``reschain``)
        :param pistacks: detected pi-stackings (records with ``proteinring`` and ``ligandring``)
        :return: list with the remaining hydrophobic contacts
        """
        #  1. Rings interacting via stacking can't have additional hydrophobic contacts between each other.
        excluded = set()
        for pistack in pistacks:
            brs = {p1.idx for p1 in pistack.proteinring.atoms}
            lrs = {p2.idx for p2 in pistack.ligandring.atoms}
            for h in all_h:
                if h.bsatom.idx in brs and h.ligatom.idx in lrs:
                    excluded.add((h.bsatom.idx, h.ligatom.idx))
        hydroph = [h for h in all_h if (h.bsatom.idx, h.ligatom.idx) not in excluded]

        #  2. If a ligand atom interacts with several binding site atoms in the same residue,
        #  keep only the one with the closest distance. The chain is part of the key, because residues in
        #  different chains may share the same residue number.
        closest_per_residue = {}
        for h in hydroph:
            key = (h.ligatom.idx, h.resnr, h.reschain)
            best = closest_per_residue.get(key)
            if best is None or best.distance > h.distance:
                closest_per_residue[key] = h
        hydroph = list(closest_per_residue.values())

        #  3. If a protein atom interacts with several neighboring ligand atoms, just keep the one with the closest dist
        bsclust = {}
        for h in hydroph:
            bsclust.setdefault(h.bsatom.idx, []).append(h)

        # Protein atoms with a single contact need no further reduction
        hydroph_final = [contacts[0] for contacts in bsclust.values() if len(contacts) == 1]

        idx_to_h = {}
        for contacts in bsclust.values():
            if len(contacts) == 1:
                continue
            # A list of tuples with the idx of an atom and one of its neighbours is created
            # NOTE(review): a contact whose ligand atom has no bonded partner among the other contacts of this
            # protein atom never enters `tuples` and therefore appears to be discarded - confirm this is intended.
            tuples = []
            all_idx = {c.ligatom.idx for c in contacts}
            for b in contacts:
                idx = b.ligatom.idx
                for n in pybel.ob.OBAtomAtomIter(b.ligatom.OBAtom):
                    n_idx = n.GetIdx()
                    if n_idx in all_idx:
                        # Store each bonded pair with the smaller index first so duplicates collapse
                        tuples.append((n_idx, idx) if n_idx < idx else (idx, n_idx))
                        idx_to_h[idx] = b

            tuples = sorted(set(tuples), key=itemgetter(1))
            clusters = cluster_doubles(tuples)  # Cluster connected atoms (i.e. find hydrophobic patches)

            for cluster in clusters:
                min_dist = float('inf')
                min_h = None
                for atm_idx in cluster:
                    h = idx_to_h[atm_idx]
                    if h.distance < min_dist:
                        min_dist = h.distance
                        min_h = h
                hydroph_final.append(min_h)
        before, reduced = len(all_h), len(hydroph_final)
        if not before == 0 and not before == reduced:
            logger.info(f'reduced number of hydrophobic contacts from {before} to {reduced}')
        return hydroph_final

    @staticmethod
    def _in_same_saltbridge(lig_idx, prot_idx, salt_lneg, salt_pneg):
        """Return True if the ligand atom and the protein atom belong to the two groups of one salt bridge."""
        for salt in salt_pneg:  # ligand group is positive, protein group is negative
            if lig_idx in [at.idx for at in salt.positive.atoms] \
                    and prot_idx in [at.idx for at in salt.negative.atoms]:
                return True
        for salt in salt_lneg:  # ligand group is negative, protein group is positive
            if lig_idx in [at.idx for at in salt.negative.atoms] \
                    and prot_idx in [at.idx for at in salt.positive.atoms]:
                return True
        return False

    @staticmethod
    def _best_hbond_per_donor(hbonds_to_reduce):
        """Keep one hydrogen bond per donor atom: the one with the largest ``angle`` (the first one on ties)."""
        best = {}
        for hbond in hbonds_to_reduce:
            current = best.get(hbond.d.idx)
            if current is None or current.angle < hbond.angle:
                best[hbond.d.idx] = hbond
        return list(best.values())

    def refine_hbonds_ldon(self, all_hbonds, salt_lneg, salt_pneg):
        """Refine selection of hydrogen bonds. Do not allow groups which already form salt bridges to form H-Bonds.

        Here the ligand is the donor (``hbond.d``) and the protein the acceptor (``hbond.a``).

        :param all_hbonds: hydrogen bonds with ligand donors
        :param salt_lneg: salt bridges with the negative group on the ligand
        :param salt_pneg: salt bridges with the negative group on the protein
        :return: list with at most one hydrogen bond per donor atom
        """
        kept = [hbond for hbond in all_hbonds
                if not self._in_same_saltbridge(hbond.d.idx, hbond.a.idx, salt_lneg, salt_pneg)]
        # Allow only one hydrogen bond per donor, select interaction with larger donor angle
        return self._best_hbond_per_donor(kept)

    def refine_hbonds_pdon(self, all_hbonds, salt_lneg, salt_pneg):
        """Refine selection of hydrogen bonds. Do not allow groups which already form salt bridges to form H-Bonds with
        atoms of the same group.

        Here the protein is the donor (``hbond.d``) and the ligand the acceptor (``hbond.a``).

        :param all_hbonds: hydrogen bonds with protein donors
        :param salt_lneg: salt bridges with the negative group on the ligand
        :param salt_pneg: salt bridges with the negative group on the protein
        :return: list with at most one hydrogen bond per donor atom
        """
        kept = [hbond for hbond in all_hbonds
                if not self._in_same_saltbridge(hbond.a.idx, hbond.d.idx, salt_lneg, salt_pneg)]
        # Allow only one hydrogen bond per donor, select interaction with larger donor angle
        return self._best_hbond_per_donor(kept)

    def refine_pi_cation_laro(self, all_picat, stacks):
        """Just important for constellations with histidine involved. If the histidine ring is positioned in stacking
        position to an aromatic ring in the ligand, there is in most cases stacking and pi-cation interaction reported
        as histidine also carries a positive charge in the ring. For such cases, only report stacking.

        :param all_picat: pi-cation interactions with the aromatic ring on the ligand
        :param stacks: detected pi-stackings
        :return: the pi-cation interactions whose ligand ring is not also stacked with a histidine ring
        """
        i_set = []
        for picat in all_picat:
            stacked_with_his = any(
                whichrestype(stack.proteinring.atoms[0]) == 'HIS' and picat.ring.obj == stack.ligandring.obj
                for stack in stacks)
            if not stacked_with_his:
                i_set.append(picat)
        return i_set

    def refine_water_bridges(self, wbridges, hbonds_ldon, hbonds_pdon):
        """A donor atom already forming a hydrogen bond is not allowed to form a water bridge. Each water molecule
        can only be donor for two water bridges, selecting the constellation with the omega angle closest to 110 deg.

        :param wbridges: candidate water bridges (records with ``d``, ``a``, ``water`` and ``w_angle``)
        :param hbonds_ldon: refined hydrogen bonds with ligand donors
        :param hbonds_pdon: refined hydrogen bonds with protein donors
        :return: list with at most two water bridges per water molecule, the best one first
        """
        donor_atoms_hbonds = {hb.d.idx for hb in hbonds_ldon + hbonds_pdon}
        omega = 110.0

        def deviation(wbridge):
            """Absolute difference between the water angle of the bridge and the ideal omega angle."""
            return abs(omega - wbridge.w_angle)

        # Just one water bridge per (water, acceptor) pair: the one with the omega angle closest to 110 deg
        best_per_pair = {}
        for wbridge in wbridges:
            if wbridge.d.idx in donor_atoms_hbonds:
                continue
            key = (wbridge.water.idx, wbridge.a.idx)
            current = best_per_pair.get(key)
            if current is None or deviation(wbridge) < deviation(current):
                best_per_pair[key] = wbridge

        # Group the remaining candidates by water molecule
        per_water = {}
        for (water_idx, _acceptor_idx), wbridge in best_per_pair.items():
            per_water.setdefault(water_idx, []).append(wbridge)

        # Keep the two candidates per water with the smallest deviation (sorted() is stable, so ties keep
        # their order of appearance)
        filtered_wb = []
        for candidates in per_water.values():
            filtered_wb.extend(sorted(candidates, key=deviation)[:2])
        return filtered_wb


class BindingSite(Mol):
    """Protein side of a binding site, holding the features that are looked up for interaction detection
    (rings, hydrophobic atoms, H-bond acceptors/donors, charged groups, halogen bond acceptors, metal binders).
    """

    def __init__(self, atoms, protcomplex, cclass, altconf, min_dist, mapper):
        """Find all relevant parts which could take part in interactions.

        :param atoms: atoms of the binding site; used for rings, hydrophobic atoms, H-bond and halogen features
        :param protcomplex: complete molecule; used for rings, charged groups and metal binding atoms
        :param cclass: stored as ``self.complex``
        :param altconf: passed on to ``Mol.__init__``
        :param min_dist: minimum distance of binding site residues to the ligand
        :param mapper: passed on to ``Mol.__init__``
        """
        Mol.__init__(self, altconf, mapper, mtype='protein', bsid=None)
        self.complex = cclass
        self.full_mol = protcomplex
        self.all_atoms = atoms
        self.min_dist = min_dist  # Minimum distance of bs res to ligand
        # Unique residue identifiers built from residue number and chain, e.g. 47A
        residue_ids = {''.join([str(whichresnumber(a)), whichchain(a)]) for a in self.all_atoms}
        self.bs_res = list(residue_ids)
        self.rings = self.find_rings(self.full_mol, self.all_atoms)
        self.hydroph_atoms = self.hydrophobic_atoms(self.all_atoms)
        self.hbond_acc_atoms = self.find_hba(self.all_atoms)
        self.hbond_don_atom_pairs = self.find_hbd(self.all_atoms, self.hydroph_atoms)
        self.charged = self.find_charged(self.full_mol)
        self.halogenbond_acc = self.find_hal(self.all_atoms)
        self.metal_binding = self.find_metal_binding(self.full_mol)

    def find_hal(self, atoms):
        """Look for halogen bond acceptors (Y-{O|P|N|S}, with Y=C,P,S).

        An oxygen, nitrogen or sulfur atom is reported when exactly one of its neighbours is a carbon, nitrogen,
        phosphorus or sulfur atom.

        :param atoms: atoms to inspect
        :return: list of ``hal_acceptor`` records (``o``, ``o_orig_idx``, ``y``, ``y_orig_idx``)
        """
        data = namedtuple('hal_acceptor', 'o o_orig_idx y y_orig_idx')
        acceptors = []
        # All oxygens, nitrogen, sulfurs with neighboring carbon, phosphor, nitrogen or sulfur
        for candidate in atoms:
            if candidate.atomicnum not in [8, 7, 16]:
                continue
            proximal = [nb for nb in pybel.ob.OBAtomAtomIter(candidate.OBAtom)
                        if nb.GetAtomicNum() in [6, 7, 15, 16]]
            if len(proximal) != 1:
                continue
            # Exactly one proximal atom
            y_atom = proximal[0]
            o_orig_idx = self.Mapper.mapid(candidate.idx, mtype=self.mtype, bsid=self.bsid)
            y_orig_idx = self.Mapper.mapid(y_atom.GetIdx(), mtype=self.mtype, bsid=self.bsid)
            acceptors.append(data(o=candidate, o_orig_idx=o_orig_idx, y=pybel.Atom(y_atom), y_orig_idx=y_orig_idx))
        return acceptors

    def find_charged(self, mol):
        """Looks for positive charges in arginine, histidine or lysine, for negative in aspartic and glutamic acid.

        Residues in chains listed in ``config.PEPTIDES`` are skipped, and if ``config.INTRA`` is set only residues
        of that chain are considered.

        :param mol: molecule whose residues (``mol.OBMol``) are scanned
        :return: list of ``pcharge`` records, one per residue with at least one contributing atom
        """
        data = namedtuple('pcharge', 'atoms atoms_orig_idx type center restype resnr reschain')
        # Residue name -> (leading character of the contributing atom types, charge label)
        charge_rules = {
            'ARG': ('N', 'positive'), 'HIS': ('N', 'positive'), 'LYS': ('N', 'positive'),
            'GLU': ('O', 'negative'), 'ASP': ('O', 'negative'),
        }
        groups = []
        # Iterate through all residue, exclude those in chains defined as peptides
        for res in pybel.ob.OBResidueIter(mol.OBMol):
            if res.GetChain() in config.PEPTIDES:
                continue
            if config.INTRA is not None and res.GetChain() != config.INTRA:
                continue
            rule = charge_rules.get(res.GetName())
            if rule is None:
                continue
            prefix, charge_type = rule
            contributing = []
            contributing_orig_idx = []
            for a in pybel.ob.OBResidueAtomIter(res):
                if not a.GetType().startswith(prefix):
                    continue
                # Residue atom property 8 - presumably OpenBabel's sidechain flag; TODO confirm
                if not res.GetAtomProperty(a, 8):
                    continue
                if self.Mapper.mapid(a.GetIdx(), mtype='protein') in self.altconf:
                    continue
                contributing.append(pybel.Atom(a))
                contributing_orig_idx.append(self.Mapper.mapid(a.GetIdx(), mtype='protein'))
            if len(contributing) == 0:
                continue
            groups.append(data(atoms=contributing,
                               atoms_orig_idx=contributing_orig_idx,
                               type=charge_type,
                               center=centroid([ac.coords for ac in contributing]),
                               restype=res.GetName(),
                               resnr=res.GetNum(),
                               reschain=res.GetChain()))
        return groups

    def find_metal_binding(self, mol):
        """Looks for atoms that could possibly be involved in chelating a metal ion.
        This can be any main chain oxygen atom or oxygen, nitrogen and sulfur from specific amino acids.

        :param mol: molecule whose residues (``mol.OBMol``) are scanned
        :return: list of ``metal_binding`` records; for each residue the sidechain atoms come first, followed by
            the main chain oxygens
        """
        data = namedtuple('metal_binding', 'atom atom_orig_idx type restype resnr reschain location')
        # Upper-cased residue name -> leading character of the sidechain atom types to look for
        sidechain_element = {'ASP': 'O', 'GLU': 'O', 'SER': 'O', 'THR': 'O', 'TYR': 'O',  # oxygens
                             'HIS': 'N',  # nitrogen
                             'CYS': 'S'}  # sulfur
        binders = []
        for res in pybel.ob.OBResidueIter(mol.OBMol):
            restype, reschain, resnr = res.GetName().upper(), res.GetChain(), res.GetNum()
            element = sidechain_element.get(restype)
            if element is not None:
                for a in pybel.ob.OBResidueAtomIter(res):
                    if not a.GetType().startswith(element):
                        continue
                    # Residue atom property 8 - presumably OpenBabel's sidechain flag; TODO confirm
                    if not res.GetAtomProperty(a, 8):
                        continue
                    if self.Mapper.mapid(a.GetIdx(), mtype='protein') in self.altconf:
                        continue
                    atom_orig_idx = self.Mapper.mapid(a.GetIdx(), mtype=self.mtype, bsid=self.bsid)
                    binders.append(data(atom=pybel.Atom(a), atom_orig_idx=atom_orig_idx, type=element,
                                        restype=restype, resnr=resnr, reschain=reschain,
                                        location='protein.sidechain'))
            for a in pybel.ob.OBResidueAtomIter(res):  # All main chain oxygens
                # Residue atom property 2 - presumably OpenBabel's backbone flag; TODO confirm
                is_mainchain_oxygen = (a.GetType().startswith('O') and res.GetAtomProperty(a, 2)
                                       and self.Mapper.mapid(a.GetIdx(), mtype='protein') not in self.altconf
                                       and restype != 'HOH')
                if not is_mainchain_oxygen:
                    continue
                atom_orig_idx = self.Mapper.mapid(a.GetIdx(), mtype=self.mtype, bsid=self.bsid)
                # Main chain records carry the residue name as returned by OpenBabel (not upper-cased)
                binders.append(data(atom=pybel.Atom(a), atom_orig_idx=atom_orig_idx, type='O',
                                    restype=res.GetName(), resnr=res.GetNum(), reschain=res.GetChain(),
                                    location='protein.mainchain'))
        return binders


class Ligand(Mol):
    def __init__(self, cclass, ligand):
        """Collect identifiers, descriptors and interaction-relevant atom groups for one ligand.

        :param cclass: owning complex object; its altconf, Mapper and atoms attributes are read here
        :param ligand: ligand record; its attributes hetid, chain, position, members, longname, type, mol,
                       can_to_pdb, atomorder and water are read here
        """
        altconf = cclass.altconf
        self.hetid, self.chain, self.position = ligand.hetid, ligand.chain, ligand.position
        # Binding site identifier of the form HETID:CHAIN:POSITION
        self.bsid = ':'.join([self.hetid, self.chain, str(self.position)])
        # NOTE(review): presumably sets self.altconf, self.Mapper and self.mtype, which are used below but are not
        # assigned in this method -- confirm against Mol.__init__
        Mol.__init__(self, altconf, cclass.Mapper, mtype='ligand', bsid=self.bsid)
        self.members = ligand.members
        self.longname = ligand.longname
        self.type = ligand.type
        self.complex = cclass
        self.molecule = ligand.mol  # Pybel Molecule
        self.smiles = self.molecule.write(format='can')  # SMILES String
        self.inchikey = self.molecule.write(format='inchikey')
        self.can_to_pdb = ligand.can_to_pdb
        if not len(self.smiles) == 0:
            # Keep only the first whitespace-separated token of the writer output
            self.smiles = self.smiles.split()[0]
        else:
            logger.warning(f'could not write SMILES for ligand {ligand}')
            self.smiles = ''
        self.heavy_atoms = self.molecule.OBMol.NumHvyAtoms()  # Heavy atoms count
        self.all_atoms = self.molecule.atoms
        self.atmdict = {l.idx: l for l in self.all_atoms}  # Ligand atoms keyed by their idx
        self.rings = self.find_rings(self.molecule, self.all_atoms)
        self.hydroph_atoms = self.hydrophobic_atoms(self.all_atoms)
        self.hbond_acc_atoms = self.find_hba(self.all_atoms)
        self.num_rings = len(self.rings)
        if self.num_rings != 0:
            logger.info(f'contains {self.num_rings} aromatic ring(s)')
        descvalues = self.molecule.calcdesc()
        self.molweight, self.logp = float(descvalues['MW']), float(descvalues['logP'])
        self.num_rot_bonds = int(self.molecule.OBMol.NumRotors())
        self.atomorder = ligand.atomorder

        ######################################################
        # Special case for hydrogen bond donor identification #
        ######################################################

        # Reverse lookups: values of the ligand map / protein map back to their keys
        self.inverse_mapping = {v: k for k, v in self.Mapper.ligandmaps[self.bsid].items()}
        self.pdb_to_idx_mapping = {v: k for k, v in self.Mapper.proteinmap.items()}
        # NOTE(review): this result is replaced by donor_pairs a few lines below -- confirm whether the call
        # is still needed
        self.hbond_don_atom_pairs = self.find_hbd(self.all_atoms, self.hydroph_atoms)

        ######
        # Donors are looked up on the corresponding atom of the complex (cclass.atoms) rather than on the ligand
        # molecule itself; every donor hydrogen attached to such an atom yields one donor pair.
        donor_pairs = []
        data = namedtuple('hbonddonor', 'd d_orig_atom d_orig_idx h type')
        for donor in self.all_atoms:
            pdbidx = self.Mapper.mapid(donor.idx, mtype='ligand', bsid=self.bsid, to='original')
            d = cclass.atoms[self.pdb_to_idx_mapping[pdbidx]]
            if d.OBAtom.IsHbondDonor():
                for adj_atom in [a for a in pybel.ob.OBAtomAtomIter(d.OBAtom) if a.IsHbondDonorH()]:
                    d_orig_atom = self.Mapper.id_to_atom(pdbidx)
                    donor_pairs.append(data(d=donor, d_orig_atom=d_orig_atom, d_orig_idx=pdbidx,
                                            h=pybel.Atom(adj_atom), type='regular'))
        self.hbond_don_atom_pairs = donor_pairs
        #######

        self.charged = self.find_charged(self.all_atoms)
        self.centroid = centroid([a.coords for a in self.all_atoms])
        # Largest distance of any ligand atom from the ligand centroid
        self.max_dist_to_center = max((euclidean3d(self.centroid, a.coords) for a in self.all_atoms))
        self.water = []
        data = namedtuple('water', 'oxy oxy_orig_idx')
        for hoh in ligand.water:
            oxy = None
            for at in pybel.ob.OBResidueAtomIter(hoh):
                # NOTE(review): altconf is compared against the raw atom idx here, whereas other altconf checks in
                # this file compare against self.Mapper.mapid(idx, mtype='protein') -- confirm which is intended
                if at.GetAtomicNum() == 8 and at.GetIdx() not in self.altconf:
                    oxy = pybel.Atom(at)
            # There are some cases where there is no oxygen in a water residue, ignore those
            if not set([at.GetAtomicNum() for at in pybel.ob.OBResidueAtomIter(hoh)]) == {1} and oxy is not None:
                # Keep only waters whose oxygen lies within the ligand radius plus config.BS_DIST of the centroid
                if euclidean3d(self.centroid, oxy.coords) < self.max_dist_to_center + config.BS_DIST:
                    oxy_orig_idx = self.Mapper.mapid(oxy.idx, mtype='protein')
                    self.water.append(data(oxy=oxy, oxy_orig_idx=oxy_orig_idx))
        self.halogenbond_don = self.find_hal(self.all_atoms)
        self.metal_binding = self.find_metal_binding(self.all_atoms, self.water)
        self.metals = []
        data = namedtuple('metal', 'm orig_m m_orig_idx')
        # Ligand atoms whose upper-cased type is listed in config.METAL_IONS
        for a in [a for a in self.all_atoms if a.type.upper() in config.METAL_IONS]:
            m_orig_idx = self.Mapper.mapid(a.idx, mtype=self.mtype, bsid=self.bsid)
            orig_m = self.Mapper.id_to_atom(m_orig_idx)
            self.metals.append(data(m=a, m_orig_idx=m_orig_idx, orig_m=orig_m))
        self.num_hba, self.num_hbd = len(self.hbond_acc_atoms), len(self.hbond_don_atom_pairs)
        self.num_hal = len(self.halogenbond_don)

    def get_canonical_num(self, atomnum):
        """Converts internal atom ID into canonical atom ID. Agrees with Canonical SMILES in XML."""
        return self.atomorder[atomnum - 1]

    def is_functional_group(self, atom, group):
        """Given a pybel atom, look up if it belongs to a functional group.

        :param atom: pybel atom to classify
        :param group: name of the group to test for: 'quartamine', 'tertamine', 'sulfonium', 'sulfonicacid',
                      'sulfate', 'phosphate', 'carboxylate', 'guanidine' or 'halocarbon'
        :return: True if the atom is the characteristic atom of the requested group, otherwise False
        """
        # Atomic numbers (integers) of all directly bonded atoms
        n_atoms = [a_neighbor.GetAtomicNum() for a_neighbor in pybel.ob.OBAtomAtomIter(atom.OBAtom)]

        if group in ['quartamine', 'tertamine'] and atom.atomicnum == 7:  # Nitrogen
            # It's a nitrogen, so could be a protonated amine or quaternary ammonium.
            # The hydrogen check must compare against the integer 1: a string never equals an atomic number.
            if 1 not in n_atoms and len(n_atoms) == 4:
                return group == 'quartamine'  # It's a quat. ammonium (N with 4 residues != H)
            elif atom.OBAtom.GetHyb() == 3 and len(n_atoms) >= 3:
                return group == 'tertamine'  # It's sp3-hybridized, so could pick up an hydrogen
            else:
                return False

        if group in ['sulfonium', 'sulfonicacid', 'sulfate'] and atom.atomicnum == 16:  # Sulfur
            if 1 not in n_atoms and len(n_atoms) == 3:  # It's a sulfonium (S with 3 residues != H)
                return group == 'sulfonium'
            elif n_atoms.count(8) == 3:  # It's a sulfonate or sulfonic acid
                return group == 'sulfonicacid'
            elif n_atoms.count(8) == 4:  # It's a sulfate
                return group == 'sulfate'

        if group == 'phosphate' and atom.atomicnum == 15:  # Phosphor
            if set(n_atoms) == {8}:  # It's a phosphate
                return True

        if group in ['carboxylate', 'guanidine'] and atom.atomicnum == 6:  # It's a carbon atom
            if n_atoms.count(8) == 2 and n_atoms.count(6) == 1:  # It's a carboxylate group
                return group == 'carboxylate'
            elif n_atoms.count(7) == 3 and len(n_atoms) == 3:  # It's a guanidine group
                # Number of bonding partners of each of the three nitrogens
                nitro_partners = [len(list(pybel.ob.OBAtomAtomIter(nitro)))
                                  for nitro in pybel.ob.OBAtomAtomIter(atom.OBAtom)]
                if min(nitro_partners) == 1:  # One nitrogen is only connected to the carbon, can pick up a H
                    return group == 'guanidine'

        if group == 'halocarbon' and atom.atomicnum in [9, 17, 35, 53]:  # Halogen atoms
            carbons = [na for na in pybel.ob.OBAtomAtomIter(atom.OBAtom) if na.GetAtomicNum() == 6]
            if len(carbons) == 1:  # Halocarbon
                return True

        # No rule matched. Always answer with an explicit boolean (the halogen branch used to fall through
        # and implicitly return None).
        return False

    def find_hal(self, atoms):
        """Collect the halogen bond donors among the given atoms (X-C, with X=F, Cl, Br, I)."""
        hal_donor = namedtuple('hal_donor', 'x orig_x x_orig_idx c c_orig_idx')
        donors = []
        for halogen in atoms:
            if not self.is_functional_group(halogen, 'halocarbon'):
                continue
            # Carbon atoms bonded to the halogen
            carbons = [nb for nb in pybel.ob.OBAtomAtomIter(halogen.OBAtom) if nb.GetAtomicNum() == 6]
            halogen_orig_idx = self.Mapper.mapid(halogen.idx, mtype=self.mtype, bsid=self.bsid)
            halogen_orig = self.Mapper.id_to_atom(halogen_orig_idx)
            carbon_orig_idx = [self.Mapper.mapid(carbon.GetIdx(), mtype=self.mtype, bsid=self.bsid)
                               for carbon in carbons]
            donors.append(hal_donor(x=halogen, orig_x=halogen_orig, x_orig_idx=halogen_orig_idx,
                                    c=pybel.Atom(carbons[0]), c_orig_idx=carbon_orig_idx))
        if donors:
            logger.info(f'ligand contains {len(donors)} halogen atom(s)')
        return donors

    def find_charged(self, all_atoms):
        """Identify charged groups in a ligand.

        Positively charged groups: the search is not exhaustive, as the cases can be quite diverse. The typical
        cases seem to be protonated amines, quaternary ammonium and sulfonium as mentioned in 'Cation-pi
        interactions in ligand recognition and catalysis' (Zacharias et al., 2002); guanidine groups are reported
        as positive as well. Negatively charged groups: phosphate, sulfonic acid, sulfate and carboxylate.

        :param all_atoms: pybel atoms of the ligand
        :return: list of 'lcharge' tuples; atoms, orig_atoms and atoms_orig_idx of one tuple are index-aligned,
                 i.e. entry i of each list refers to the same atom
        """
        data = namedtuple('lcharge', 'atoms orig_atoms atoms_orig_idx type center fgroup')

        def neighbors_of(atom, atomicnum=None):
            """Return the neighbors of atom as pybel atoms, optionally restricted to one element."""
            return [pybel.Atom(neighbor) for neighbor in pybel.ob.OBAtomAtomIter(atom.OBAtom)
                    if atomicnum is None or neighbor.GetAtomicNum() == atomicnum]

        def charged_group(members, charge, center, fgroup):
            """Build one result entry; original indices are derived from members so all lists stay aligned."""
            orig_idx = [self.Mapper.mapid(member.idx, mtype=self.mtype, bsid=self.bsid) for member in members]
            orig_atoms = [self.Mapper.id_to_atom(idx) for idx in orig_idx]
            return data(atoms=members, orig_atoms=orig_atoms, atoms_orig_idx=orig_idx, type=charge,
                        center=center, fgroup=fgroup)

        a_set = []
        for a in all_atoms:
            # Nitrogen-centered cations
            if self.is_functional_group(a, 'quartamine'):
                a_set.append(charged_group([a], 'positive', list(a.coords), 'quartamine'))
            elif self.is_functional_group(a, 'tertamine'):
                a_set.append(charged_group([a], 'positive', list(a.coords), 'tertamine'))

            if self.is_functional_group(a, 'sulfonium'):
                a_set.append(charged_group([a], 'positive', list(a.coords), 'sulfonium'))

            # Central atom plus all of its neighbors. The central atom's original index used to be listed twice,
            # which shifted atoms_orig_idx/orig_atoms against atoms.
            if self.is_functional_group(a, 'phosphate'):
                a_set.append(charged_group([a] + neighbors_of(a), 'negative', a.coords, 'phosphate'))

            if self.is_functional_group(a, 'sulfonicacid'):
                # Central sulfur plus its oxygens
                a_set.append(charged_group([a] + neighbors_of(a, 8), 'negative', a.coords, 'sulfonicacid'))
            elif self.is_functional_group(a, 'sulfate'):
                # Central sulfur plus all neighbors. The indices used to be mapped before the neighbors were
                # collected, so the oxygens were missing from atoms_orig_idx/orig_atoms.
                a_set.append(charged_group([a] + neighbors_of(a), 'negative', a.coords, 'sulfate'))

            if self.is_functional_group(a, 'carboxylate'):
                # Only the two oxygens contribute; the charge center is their centroid
                oxygens = neighbors_of(a, 8)
                a_set.append(charged_group(oxygens, 'negative', centroid([o.coords for o in oxygens]),
                                           'carboxylate'))
            elif self.is_functional_group(a, 'guanidine'):
                # The nitrogens contribute; the charge center is the central carbon
                a_set.append(charged_group(neighbors_of(a, 7), 'positive', a.coords, 'guanidine'))
        return a_set

    def find_metal_binding(self, lig_atoms, water_oxygens):
        """Looks for atoms that could possibly be involved in binding a metal ion.
        This can be any water oxygen, as well as oxygen from carboxylate, phosphoryl, phenolate, alcohol;
        nitrogen from imidazole; sulfur from thiolate or iron-sulfur clusters.

        :param lig_atoms: pybel atoms of the ligand
        :param water_oxygens: water entries providing the attributes oxy (pybel atom) and oxy_orig_idx
        :return: list of 'metal_binding' tuples; orig_atom and atom_orig_idx always refer to the reported atom
        """
        a_set = []
        data = namedtuple('metal_binding', 'atom orig_atom atom_orig_idx type fgroup restype resnr reschain location')
        for oxygen in water_oxygens:
            a_set.append(data(atom=oxygen.oxy, atom_orig_idx=oxygen.oxy_orig_idx, type='O', fgroup='water',
                              restype=whichrestype(oxygen.oxy), resnr=whichresnumber(oxygen.oxy),
                              reschain=whichchain(oxygen.oxy), location='water',
                              orig_atom=self.Mapper.id_to_atom(oxygen.oxy_orig_idx)))

        def ligand_site(atom, orig_idx, atype, fgroup):
            """Build the entry for one ligand atom; orig_atom is looked up from the atom's own original index."""
            return data(atom=atom, atom_orig_idx=orig_idx, type=atype, fgroup=fgroup,
                        restype=self.hetid, resnr=self.position, reschain=self.chain,
                        location='ligand', orig_atom=self.Mapper.id_to_atom(orig_idx))

        def oxygen_sites(neighbors, fgroup):
            """Build one entry per oxygen among the given neighbor atoms (OBAtoms)."""
            sites = []
            for neighbor in neighbors:
                if neighbor.GetAtomicNum() == 8:
                    # The oxygen's own index is used for orig_atom as well; the index of the central
                    # carbon/phosphorus used to be taken here, which paired the oxygen with the wrong atom.
                    neighbor_orig_idx = self.Mapper.mapid(neighbor.GetIdx(), mtype='ligand', bsid=self.bsid)
                    sites.append(ligand_site(pybel.Atom(neighbor), neighbor_orig_idx, 'O', fgroup))
            return sites

        for a in lig_atoms:
            a_orig_idx = self.Mapper.mapid(a.idx, mtype='ligand', bsid=self.bsid)
            # Neighboring atoms, materialized once so they can be inspected repeatedly
            n_atoms = list(pybel.ob.OBAtomAtomIter(a.OBAtom))
            # All atomic numbers (integers) of neighboring atoms
            n_atoms_atomicnum = [n.GetAtomicNum() for n in n_atoms]

            if a.atomicnum == 8:  # Oxygen
                if any(n.IsAromatic() for n in n_atoms) and not a.OBAtom.IsAromatic():  # Phenolate oxygen
                    a_set.append(ligand_site(a, a_orig_idx, 'O', 'phenolate'))
                # Oxygen in alcohol (R-[O]-H). The hydrogen count must use the integer 1; counting the string
                # '1' never matched, so alcohols were never reported. Checked after the phenolate rule so that
                # an oxygen is reported at most once and previously reported phenolates keep their label.
                elif n_atoms_atomicnum.count(1) == 1 and len(n_atoms_atomicnum) == 2:
                    a_set.append(ligand_site(a, a_orig_idx, 'O', 'alcohol'))

            if a.atomicnum == 6:  # It's a carbon atom
                if n_atoms_atomicnum.count(8) == 2 and n_atoms_atomicnum.count(6) == 1:  # It's a carboxylate group
                    a_set.extend(oxygen_sites(n_atoms, 'carboxylate'))

            if a.atomicnum == 15:  # It's a phosphor atom
                if n_atoms_atomicnum.count(8) >= 3:  # It's a phosphoryl
                    a_set.extend(oxygen_sites(n_atoms, 'phosphoryl'))
                elif n_atoms_atomicnum.count(8) == 2:  # It's another phosphor-containing group #@todo (correct name?)
                    a_set.extend(oxygen_sites(n_atoms, 'phosphor.other'))

            if a.atomicnum == 7:  # It's a nitrogen atom
                if n_atoms_atomicnum.count(6) == 2:  # It's imidazole/pyrrole or similar
                    a_set.append(ligand_site(a, a_orig_idx, 'N', 'imidazole/pyrrole'))

            if a.atomicnum == 16:  # It's a sulfur atom
                if any(n.IsAromatic() for n in n_atoms) and not a.OBAtom.IsAromatic():  # Thiolate
                    a_set.append(ligand_site(a, a_orig_idx, 'S', 'thiolate'))
                if set(n_atoms_atomicnum) == {26}:  # Sulfur in Iron sulfur cluster
                    a_set.append(ligand_site(a, a_orig_idx, 'S', 'iron-sulfur.cluster'))

        return a_set


class PDBComplex:
    """Contains a collection of objects associated with a PDB complex, i.e. one or several ligands and their binding
    sites as well as information about the interactions between them. Provides functions to load and prepare input files
    such as PDB files.
    """

    def __init__(self):
        """Set up an empty complex; load_pdb() fills in the structure-dependent attributes."""
        # Loaded structure and bookkeeping about its source
        self.protcomplex = None
        self.filetype = None
        self.pymol_name = None
        self.corrected_pdb = ''
        self.sourcefiles = {}
        self.information = {}
        self._output_path = tempfile.gettempdir()

        # Atom-level data
        self.atoms = {}  # Dictionary of Pybel atoms, accessible by their idx
        self.altconf = []  # Atom idx of atoms with alternate conformations
        self.Mapper = Mapper()

        # Residues, linkages and ligands
        self.modres = set()
        self.resis = []
        self.covalent = []  # Covalent linkages between ligands and protein residues/other ligands
        self.excluded = []  # Excluded ligands
        self.ligands = []

        # Results
        self.interaction_sets = {}  # Dictionary with site identifiers as keys and object as value

    def __str__(self):
        formatted_lig_names = [":".join([x.hetid, x.chain, str(x.position)]) for x in self.ligands]
        return "Protein structure %s with ligands:\n" % (self.pymol_name) + "\n".join(
            [lig for lig in formatted_lig_names])

    def load_pdb(self, pdbpath, as_string=False):
        """Loads a pdb file with protein AND ligand(s), separates and prepares them.
        If specified 'as_string', the input is a PDB string instead of a path.

        :param pdbpath: path to a PDB file, or the PDB content itself if as_string is True
        :param as_string: interpret pdbpath as PDB content instead of a file path
        """
        # Base name for derived file names and for the fallback PyMOL name. String input has no file name, so a
        # fixed placeholder is used rather than fragments of the PDB content.
        basename = "from_stdin" if as_string else os.path.basename(pdbpath).split('.')[0]

        if as_string:
            self.sourcefiles['pdbcomplex.original'] = None
            self.sourcefiles['pdbcomplex'] = None
            self.sourcefiles['pdbstring'] = pdbpath
        else:
            self.sourcefiles['pdbcomplex.original'] = pdbpath
            self.sourcefiles['pdbcomplex'] = pdbpath
        self.information['pdbfixes'] = False
        pdbparser = PDBParser(pdbpath, as_string=as_string)  # Parse PDB file to find errors and get additional data
        # #@todo Refactor and rename here
        self.Mapper.proteinmap = pdbparser.proteinmap
        self.Mapper.reversed_proteinmap = {v: k for k, v in self.Mapper.proteinmap.items()}
        self.modres = pdbparser.modres
        self.covalent = pdbparser.covalent
        self.altconf = pdbparser.altconformations
        self.corrected_pdb = pdbparser.corrected_pdb

        if not config.PLUGIN_MODE:
            if pdbparser.num_fixed_lines > 0:
                logger.info(f'{pdbparser.num_fixed_lines} lines automatically fixed in PDB input file')
                # Save modified PDB file
                pdbpath_fixed = tmpfile(prefix='plipfixed.' + basename + '_', direc=self.output_path)
                create_folder_if_not_exists(self.output_path)
                self.sourcefiles['pdbcomplex'] = pdbpath_fixed
                # Replace every run of non-ASCII characters with a single space
                self.corrected_pdb = re.sub(r'[^\x00-\x7F]+', ' ', self.corrected_pdb)
                if not config.NOFIXFILE:  # Only write to file if this option is not activated
                    with open(pdbpath_fixed, 'w') as f:
                        f.write(self.corrected_pdb)
                self.information['pdbfixes'] = True

        if not as_string:
            self.sourcefiles['filename'] = os.path.basename(self.sourcefiles['pdbcomplex'])
        self.protcomplex, self.filetype = read_pdb(self.corrected_pdb, as_string=True)

        # Update the model in the Mapper class instance
        self.Mapper.original_structure = self.protcomplex.OBMol
        logger.info('PDB structure successfully read')

        # Determine (temporary) PyMOL Name from the base name
        self.pymol_name = basename + '-Protein'
        # Replace characters causing problems in PyMOL
        self.pymol_name = self.pymol_name.replace(' ', '').replace('(', '').replace(')', '').replace('-', '_')
        # But if possible, name it after PDBID in Header
        if 'HEADER' in self.protcomplex.data:  # If the PDB file has a proper header
            potential_name = self.protcomplex.data['HEADER'][56:60].lower()
            if extract_pdbid(potential_name) != 'UnknownProtein':
                self.pymol_name = potential_name
        logger.debug(f'PyMOL name set as: {self.pymol_name}')

        # Extract and prepare ligands
        ligandfinder = LigandFinder(self.protcomplex, self.altconf, self.modres, self.covalent, self.Mapper)
        self.ligands = ligandfinder.ligands
        self.excluded = ligandfinder.excluded

        # decide whether to add polar hydrogens
        if not config.NOHYDRO:
            self.protcomplex.OBMol.AddPolarHydrogens()
            output_path = os.path.join(self._output_path, f'{basename}_protonated.pdb')
            self.protcomplex.write('pdb', output_path, overwrite=True)
            logger.info(f'protonated structure written to {output_path}')
        else:
            logger.warning('no polar hydrogens will be assigned (make sure your structure contains hydrogens)')

        # Index all atoms of the (possibly protonated) structure by their idx
        for atm in self.protcomplex:
            self.atoms[atm.idx] = atm

        if len(self.excluded) != 0:
            logger.info(f'excluded molecules as ligands: {self.excluded}')

        # Residues that may form the binding site: nucleotides in DNA receptor mode, otherwise residues with
        # residue property 0 set
        if config.DNARECEPTOR:
            self.resis = [obres for obres in pybel.ob.OBResidueIter(
                self.protcomplex.OBMol) if obres.GetName() in config.DNA + config.RNA]
        else:
            self.resis = [obres for obres in pybel.ob.OBResidueIter(
                self.protcomplex.OBMol) if obres.GetResidueProperty(0)]

        num_ligs = len(self.ligands)
        if num_ligs == 1:
            logger.info('analyzing one ligand')
        elif num_ligs > 1:
            logger.info(f'analyzing {num_ligs} ligands')
        else:
            logger.info('structure contains no ligands')

    def analyze(self):
        """Triggers analysis of all complexes in structure"""
        for ligand in self.ligands:
            self.characterize_complex(ligand)

    def characterize_complex(self, ligand):
        """Handles all basic functions for characterizing the interactions for one ligand.

        Logs a short description of the ligand, selects the binding site atoms around it and stores the resulting
        interaction object in self.interaction_sets under the title of the ligand molecule.

        :param ligand: ligand record; its members, longname, type, chain and mol attributes are read here
        """
        site = ' + '.join(':'.join(str(x) for x in member) for member in ligand.members)
        if len(site) > 20:
            site = site[:20] + '...'
        longname = ligand.longname
        if len(longname) > 20:
            longname = longname[:20] + '...'
        ligtype = 'unspecified type' if ligand.type == 'UNSPECIFIED' else ligand.type
        ligtext = f'{longname} [{ligtype}] -- {site}'
        logger.info(f'processing ligand {ligtext}')
        if ligtype == 'PEPTIDE':
            logger.info(f'chain {ligand.chain} will be processed in [PEPTIDE / INTER-CHAIN] mode')
        if ligtype == 'INTRA':
            logger.info(f'chain {ligand.chain} will be processed in [INTRA-CHAIN] mode')
        any_in_biolip = not {x[0] for x in ligand.members}.isdisjoint(config.biolip_list)

        if ligtype not in ['POLYMER', 'DNA', 'ION', 'DNA+ION', 'RNA+ION', 'SMALLMOLECULE+ION'] and any_in_biolip:
            logger.info('may be biologically irrelevant')

        lig_obj = Ligand(self, ligand)
        cutoff = lig_obj.max_dist_to_center + config.BS_DIST
        # A set makes the per-atom residue membership test below constant-time
        bs_res = set(self.extract_bs(cutoff, lig_obj.centroid, self.resis))
        # All atoms belonging to a binding site residue which are mapped and not an alternate conformation
        bs_atoms = [atom for idx, atom in self.atoms.items()
                    if atom.OBAtom.GetResidue().GetIdx() in bs_res
                    and idx in self.Mapper.proteinmap
                    and self.Mapper.mapid(idx, mtype='protein') not in self.altconf]
        if ligand.type == 'PEPTIDE':
            # If peptide, don't consider the peptide chain as part of the protein binding site
            bs_atoms = [a for a in bs_atoms if a.OBAtom.GetResidue().GetChain() != lig_obj.chain]
        if ligand.type == 'INTRA':
            # Interactions within the chain
            bs_atoms = [a for a in bs_atoms if a.OBAtom.GetResidue().GetChain() == lig_obj.chain]

        # Create hash with BSRES -> (MINDIST_TO_LIG, AA_TYPE)
        # and refine binding site atom selection with exact threshold.
        # The ligand coordinates do not change inside the loop, so they are collected once up front.
        lig_coords = [l.coords for l in ligand.mol.atoms]
        min_dist = {}
        bs_atoms_refined = []
        if lig_coords:
            for r in bs_atoms:
                r_coords = r.coords
                # Distance of this binding site atom to its closest ligand atom
                closest = min(euclidean3d(r_coords, l_coords) for l_coords in lig_coords)
                bs_res_id = ''.join([str(whichresnumber(r)), whichchain(r)])
                if bs_res_id not in min_dist or min_dist[bs_res_id][0] > closest:
                    min_dist[bs_res_id] = (closest, whichrestype(r))
                if closest <= config.BS_DIST:
                    bs_atoms_refined.append(r)
        num_bs_atoms = len(bs_atoms_refined)
        logger.info(f'binding site atoms in vicinity ({config.BS_DIST} A max. dist: {num_bs_atoms})')

        bs_obj = BindingSite(bs_atoms_refined, self.protcomplex, self, self.altconf, min_dist, self.Mapper)
        pli_obj = PLInteraction(lig_obj, bs_obj, self)
        self.interaction_sets[ligand.mol.title] = pli_obj

    def extract_bs(self, cutoff, ligcentroid, resis):
        """Return list of ids from residues belonging to the binding site"""
        return [obres.GetIdx() for obres in resis if self.res_belongs_to_bs(obres, cutoff, ligcentroid)]

    def res_belongs_to_bs(self, res, cutoff, ligcentroid):
        """Decide whether a residue counts as part of the binding site.

        The residue qualifies if the centroid of its atoms is closer than cutoff to the ligand centroid and its
        chain is not listed in config.PEPTIDES (chains restricted by the user, e.g. by defining a peptide chain).
        """
        atom_positions = [(atm.x(), atm.y(), atm.z()) for atm in pybel.ob.OBResidueAtomIter(res)]
        # Check geometry
        distance = euclidean3d(centroid(atom_positions), ligcentroid)
        near_enough = bool(distance < cutoff)
        # Check chain membership
        restricted_chain = res.GetChain() in config.PEPTIDES
        return near_enough and not restricted_chain

    def get_atom(self, idx):
        return self.atoms[idx]

    @property
    def output_path(self):
        """Directory used for files written by this complex (the stored, already expanded path)."""
        return self._output_path

    @output_path.setter
    def output_path(self, path):
        # The path is passed through tilde_expansion() before it is stored
        self._output_path = tilde_expansion(path)