aboutsummaryrefslogtreecommitdiff
path: root/app/lstm_chem/utils/smiles_tokenizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'app/lstm_chem/utils/smiles_tokenizer.py')
-rwxr-xr-xapp/lstm_chem/utils/smiles_tokenizer.py72
1 files changed, 72 insertions, 0 deletions
diff --git a/app/lstm_chem/utils/smiles_tokenizer.py b/app/lstm_chem/utils/smiles_tokenizer.py
new file mode 100755
index 0000000..d15d625
--- /dev/null
+++ b/app/lstm_chem/utils/smiles_tokenizer.py
@@ -0,0 +1,72 @@
+import copy
+import numpy as np
+
+import time
+
+
+class SmilesTokenizer(object):
+ def __init__(self):
+ atoms = [
+ 'Li',
+ 'Na',
+ 'Al',
+ 'Si',
+ 'Cl',
+ 'Sc',
+ 'Zn',
+ 'As',
+ 'Se',
+ 'Br',
+ 'Sn',
+ 'Te',
+ 'Cn',
+ 'H',
+ 'B',
+ 'C',
+ 'N',
+ 'O',
+ 'F',
+ 'P',
+ 'S',
+ 'K',
+ 'V',
+ 'I',
+ ]
+ special = [
+ '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
+ '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
+ ]
+ padding = ['G', 'A', 'E']
+
+ self.table = sorted(atoms, key=len, reverse=True) + special + padding
+ self.table_len = len(self.table)
+
+ self.one_hot_dict = {}
+ for i, symbol in enumerate(self.table):
+ vec = np.zeros(self.table_len, dtype=np.float32)
+ vec[i] = 1
+ self.one_hot_dict[symbol] = vec
+
+ def tokenize(self, smiles):
+ N = len(smiles)
+ i = 0
+ token = []
+
+ timeout = time.time() + 5 # 5 seconds from now
+ while (i < N):
+ for j in range(self.table_len):
+ symbol = self.table[j]
+ if symbol == smiles[i:i + len(symbol)]:
+ token.append(symbol)
+ i += len(symbol)
+ break
+ if time.time() > timeout:
+ break
+ return token
+
+ def one_hot_encode(self, tokenized_smiles):
+ result = np.array(
+ [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
+ dtype=np.float32)
+ result = result.reshape(1, result.shape[0], result.shape[1])
+ return result