Diffstat (limited to 'lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py')
-rwxr-xr-x  lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py  72
1 file changed, 0 insertions, 72 deletions
diff --git a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py b/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
deleted file mode 100755
index d15d625..0000000
--- a/lstm_chem/utils/smiles_tokenizer.py.4faeffb638548d04ca4415dfe32cf8c7.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import copy
-import numpy as np
-
-import time
-
-
-class SmilesTokenizer(object):
- def __init__(self):
- atoms = [
- 'Li',
- 'Na',
- 'Al',
- 'Si',
- 'Cl',
- 'Sc',
- 'Zn',
- 'As',
- 'Se',
- 'Br',
- 'Sn',
- 'Te',
- 'Cn',
- 'H',
- 'B',
- 'C',
- 'N',
- 'O',
- 'F',
- 'P',
- 'S',
- 'K',
- 'V',
- 'I',
- ]
- special = [
- '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5',
- '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
- ]
- padding = ['G', 'A', 'E']
-
- self.table = sorted(atoms, key=len, reverse=True) + special + padding
- self.table_len = len(self.table)
-
- self.one_hot_dict = {}
- for i, symbol in enumerate(self.table):
- vec = np.zeros(self.table_len, dtype=np.float32)
- vec[i] = 1
- self.one_hot_dict[symbol] = vec
-
- def tokenize(self, smiles):
- N = len(smiles)
- i = 0
- token = []
-
- timeout = time.time() + 5 # 5 seconds from now
- while (i < N):
- for j in range(self.table_len):
- symbol = self.table[j]
- if symbol == smiles[i:i + len(symbol)]:
- token.append(symbol)
- i += len(symbol)
- break
- if time.time() > timeout:
- break
- return token
-
- def one_hot_encode(self, tokenized_smiles):
- result = np.array(
- [self.one_hot_dict[symbol] for symbol in tokenized_smiles],
- dtype=np.float32)
- result = result.reshape(1, result.shape[0], result.shape[1])
- return result